From 062403f6bfc0e0b8a68dfa4c1ccd78c13caa08e5 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Sat, 8 May 2021 00:44:33 +0800
Subject: [PATCH 01/96] add .h5 file in count.py
--- celescope/tools/count.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+)
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index 902e7524..e7ed3167 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py
@@ -12,6 +12,7 @@ import subprocess from scipy.io import mmwrite from scipy.sparse import csr_matrix, coo_matrix import pysam +import h5py from celescope.tools.utils import add_log, format_number, glob_genomeDir, gene_convert, s_common, add_mem from celescope.tools.cellranger3.cell_calling_3 import cell_calling_3 from celescope.tools.__init__ import MATRIX_FILE_NAME, FEATURE_FILE_NAME, BARCODE_FILE_NAME
@@ -262,6 +263,26 @@ def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', cell_bc=None genes.to_csv(f'{matrix_10X_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) barcodes.to_csv(f'{matrix_10X_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') mmwrite(f'{matrix_10X_dir}/{MATRIX_FILE_NAME}', mtx) + + bc_list = df_UMI.index.levels[1].tolist() + gene_name = list(genes['gene_name']) + gene_id = list(genes['gene_id']) + X_data, X_indices, X_indptr = df_UMI.UMI, df_UMI.index.labels[0], df_UMI.index.labels[1] + + f = h5py.File(f"{outdir}/{sample}.h5", "w") + dt = h5py.string_dtype(encoding='utf-8') + g1 = f.create_group('obs') + g1_d1 = g1.create_dataset('_index', data=bc_list, dtype=dt) + g2 = f.create_group('var') + g2_d1 = g2.create_dataset('_index', data=gene_name, dtype=dt) + g2_d2 = g2.create_dataset('gene_ids', data=gene_id, dtype=dt) + g3 = f.create_group('X') + g3_d1 = g3.create_dataset('data', data=X_data) + g3_d2 = g3.create_dataset('indices', data=X_indices) + g3_d3 = g3.create_dataset('indptr', data=X_indptr) + + f.close() + + return matrix_10X_dir
-- Gitee
From 2c0e59ffd3f715ebe08e8782c4bd3ebc0e864640 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Sat, 8 May 2021 00:45:40 +0800
Subject: [PATCH 02/96] auto
--- .DS_Store | Bin 0 -> 6148 bytes celescope/.DS_Store | Bin 0 -> 8196 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 .DS_Store create mode 100644 celescope/.DS_Store
diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fde5083c0902b4b0b5ff087a8db22e54d5043b62
GIT binary patch literal 6148 [binary literal data omitted] literal 0 HcmV?d00001
diff --git a/celescope/.DS_Store b/celescope/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..78b57bb8670a1b98f6c056cf4c4e0cce4fd8f49b
GIT binary patch literal 8196 [binary literal data omitted; the tail of this patch, the mail separator, and the commit hash and author line of PATCH 03 were lost in extraction]
Date: Mon, 10 May 2021 13:47:31 +0800
Subject: [PATCH 03/96] rm .h5
--- celescope/tools/count.py | 18 ------------------ 1 file changed, 18 deletions(-)
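[Editor's note — not part of the patch series: PATCH 01 above writes the expression matrix into {sample}.h5 as three flat arrays under X/. As written, the datasets named indices and indptr actually hold the gene and barcode integer codes of each (gene, barcode, count) triplet — COO coordinates — rather than a true CSR index pointer, and df_UMI.index.labels is the pre-0.24 pandas MultiIndex API (index.codes in later versions). A minimal read-back sketch under those assumptions; the file name is illustrative:]

    import h5py
    from scipy.sparse import coo_matrix

    with h5py.File("sample.h5", "r") as f:
        genes = [g.decode() for g in f["var/_index"][:]]       # gene names
        barcodes = [b.decode() for b in f["obs/_index"][:]]    # cell barcodes
        # data / indices / indptr hold count, gene code, barcode code per entry
        mat = coo_matrix(
            (f["X/data"][:], (f["X/indices"][:], f["X/indptr"][:])),
            shape=(len(genes), len(barcodes)),
        )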
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index e7ed3167..61d2545b 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -264,24 +264,6 @@ def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', cell_bc=None barcodes.to_csv(f'{matrix_10X_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') mmwrite(f'{matrix_10X_dir}/{MATRIX_FILE_NAME}', mtx) - bc_list = df_UMI.index.levels[1].tolist() - gene_name = list(genes['gene_name']) - gene_id = list(genes['gene_id']) - X_data, X_indices, X_indptr = df_UMI.UMI, df_UMI.index.labels[0], df_UMI.index.labels[1] - - f = h5py.File(f"{outdir}/{sample}.h5", "w") - dt = h5py.string_dtype(encoding='utf-8') - g1 = f.create_group('obs') - g1_d1 = g1.create_dataset('_index', data=bc_list, dtype=dt) - g2 = f.create_group('var') - g2_d1 = g2.create_dataset('_index', data=gene_name, dtype=dt) - g2_d2 = g2.create_dataset('gene_ids', data=gene_id, dtype=dt) - g3 = f.create_group('X') - g3_d1 = g3.create_dataset('data', data=X_data) - g3_d2 = g3.create_dataset('indices', data=X_indices) - g3_d3 = g3.create_dataset('indptr', data=X_indptr) - - f.close() return matrix_10X_dir -- Gitee From 6b9005d7cd62cd5975569946ef611c369b96d7b3 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 14:40:29 +0800 Subject: [PATCH 04/96] update to seurat4.0 and add h5 --- celescope/tools/auto_assign.R | 21 ++++----- celescope/tools/run_analysis.R | 78 ++++++++++++++++++++++------------ conda_pkgs.txt | 2 +- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/celescope/tools/auto_assign.R b/celescope/tools/auto_assign.R index efe4a810..f33b8e37 100755 --- a/celescope/tools/auto_assign.R +++ b/celescope/tools/auto_assign.R @@ -31,7 +31,7 @@ n_cell_name <- length(cell_name) #reset #all_data <- SetAllIdent(object = all_data, id = origin.cluster) -clusters <- sort(unique(all_data@ident)) +clusters <- sort(unique(all_data@active.ident)) #create dir auto_dir <- stringr::str_glue('{outdir}/{sample}_auto_assign/') @@ -47,9 +47,9 @@ for (cluster in clusters){ index = index + 1 pos = unlist(strsplit(marker_file[index,2,drop=T],",")) neg = tryCatch(unlist(strsplit(marker_file[index,3,drop=T],",")) ,error=function(e){} ) - for (feature in pos){ + for (F in pos){ tryCatch({ - dat <- FindMarkers(all_data,genes.use=feature,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) + dat <- FindMarkers(all_data,feature=F,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) dat$cell_type <- cell dat$cluster <- cluster dat <- rownames_to_column(dat,var="gene") @@ -61,13 +61,13 @@ for (cluster in clusters){ all_dat <- rbind(all_dat,dat) } } - ,error=function(e){print(paste0(feature," not found in cluster ",cluster)) }) + ,error=function(e){print(paste0(F," not found in cluster ",cluster)) }) } if (!is.na(neg) && !is.null(neg)){ - for (feature in neg){ + for (F in neg){ tryCatch({ - dat <- FindMarkers(all_data,genes.use=feature,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) + dat <- FindMarkers(all_data,feature=F,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) dat$cell_type <- cell dat$cluster <- cluster dat <- rownames_to_column(dat,var="gene") @@ -79,13 +79,14 @@ for (cluster in clusters){ all_dat <- rbind(all_dat,dat) } } - ,error=function(e){print(paste0(feature," not found in cluster ",cluster)) }) + ,error=function(e){print(paste0(F," not found in cluster ",cluster)) }) } } } } +all_dat$cluster <- as.numeric(all_dat$cluster) + 1 all_dat <- mutate(all_dat,pct.diff=pct.1-pct.2) exp.out = 
stringr::str_glue('{auto_dir}/{sample}_type_marker_exp.tsv') write_tsv(all_dat, exp.out) @@ -109,16 +110,16 @@ for (cluster in clusters){ dev.off() png(paste0(png_dir,cluster,"_logfc.png"),width=1200,height=1000) - p2 <- ggplot(c,aes(x=interaction(gene,cell_type,type),avg_logFC,fill=cell_type)) +geom_bar(stat="identity")+ coord_flip() + scale_fill_manual(values=color2) + p2 <- ggplot(c,aes(x=interaction(gene,cell_type,type),avg_log2FC,fill=cell_type)) +geom_bar(stat="identity")+ coord_flip() + scale_fill_manual(values=color2) print (p2) dev.off() } # auto assign -exp[exp$type=="negative",]$avg_logFC = -(exp[exp$type=="negative",]$avg_logFC) +exp[exp$type=="negative",]$avg_log2FC = -(exp[exp$type=="negative",]$avg_log2FC) exp[exp$type=="negative",]$pct.diff = -(exp[exp$type=="negative",]$pct.diff) a <- group_by(exp,cluster,cell_type) -as <- summarize(a,avg_pct.diff=mean(pct.diff),avg_logfc=mean(avg_logFC),max_p_val_adj=max(p_val_adj)) +as <- summarize(a,avg_pct.diff=mean(pct.diff),avg_logfc=mean(avg_log2FC),max_p_val_adj=max(p_val_adj)) as1 <- group_by(ungroup(as),cluster) as1 <- mutate(as1,pct_rank = rank(avg_pct.diff), logfc_rank= rank(avg_logfc),total_rank=pct_rank+logfc_rank) diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 5e0b5592..fe4e7415 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -1,6 +1,9 @@ -library(Seurat) +library(Seurat) # v4.0 library(tidyverse) library(argparser) +library(hdf5r) +library(rhdf5) + argv <- arg_parser('') argv <- add_argument(argv,"--matrix_file", help="matrix file") @@ -17,18 +20,32 @@ save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/") rds.out = paste0(outdir,'/',sample,'.rds') +# read 10X +matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") +rds = CreateSeuratObject(matrix, pro=sample) -rds = CreateSeuratObject(raw.data = matrix,project=sample) +# generate h5ad file +x = GetAssayData(rds,slot="count") +mtx = as.matrix(x) +barcode = colnames(rds) +geneid = rownames(rds) +h5.out = stringr::str_glue('{outdir}/{sample}.h5') +path <- path.expand(h5.out) +h5createFile(path) +h5f <- H5Fopen(path) +h5writeDataset(mtx,h5f,"X") +h5writeDataset(barcode,h5f,"obs") +h5writeDataset(geneid,h5f,"var") +H5Fclose(h5f) # mito -mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@data), value = TRUE, ignore.case=TRUE) -percent.mito <- Matrix::colSums(rds@raw.data[mito.genes,])/Matrix::colSums(rds@raw.data) +mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@assays$RNA@data), value = TRUE, ignore.case=TRUE) +percent.mito <- Matrix::colSums(rds@assays$RNA@counts[mito.genes,])/Matrix::colSums(rds@assays$RNA@counts) rds <- AddMetaData(object = rds, metadata = percent.mito, col.name = "percent.mito") meta = rds@meta.data total_cell = dim(meta)[1] @@ -43,44 +60,53 @@ mito_df$cell_percent = paste0(round(mito_df$cell_percent * 100,2),"%") mito_df$mito_percent = paste0("Fraction of cells have mito gene percent>",round(mito_df$mito_percent * 100,2),"%") write_delim(mito_df, mito.out, col_names=F, delim=":") -rds <- NormalizeData(object = rds, normalization.method = "LogNormalize",scale.factor = 10000) -rds <- FindVariableGenes(object = rds, mean.function = ExpMean, dispersion.function = LogVMR, 
x.low.cutoff = 0.1, y.cutoff = 1, do.contour=F) -use.gene = rds@var.genes -rds <- ScaleData(object = rds,vars.to.regress = c("nUMI", "percent.mito"),genes.use =use.gene) -rds <- RunPCA(object = rds, pc.genes = use.gene, do.print = FALSE) -rds <- FindClusters(object = rds, reduction.type = "pca", dims.use = 1:20, resolution = resolution, print.output = 0, save.SNN = TRUE,force.recalc = TRUE) -rds@meta.data[[res_str]] = as.numeric(rds@meta.data[[res_str]]) + 1 -rds = SetAllIdent(rds, res_str) - -# Run Non-linear dimensional reduction (tSNE) -rds <- RunTSNE(object = rds, dims.use = 1:20, do.fast = TRUE,check_duplicates = FALSE) + +rds <- NormalizeData(rds, normalization.method = "LogNormalize",scale.factor = 10000) +rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 2000, mean.cutoff = c(0.1, 8), dispersion.cutoff = c(1, Inf), + mean.function = ExpMean, dispersion.function = LogVMR) + +use.genes <- rds@assays$RNA@var.features +rds <- ScaleData(rds, vars.to.regress = c("nCount_RNA", "percent.mito"), features = use.genes) +rds <- RunPCA(object = rds, features = use.genes, do.print = FALSE) +rds <- FindNeighbors(rds, dims = 1:20, force.recalc = TRUE, reduction = "pca") +rds <- FindClusters(rds, resolution = resolution) + +# tsne and umap +rds <- RunTSNE(rds, dims = 1:20, do.fast = TRUE, check_duplicates = FALSE) + + tryCatch({ - rds.markers <- FindAllMarkers(object = rds, genes.use = use.gene) - rds.markers = dplyr::group_by(rds.markers,cluster) %>% dplyr::arrange(desc(avg_logFC)) + rds.markers <- FindAllMarkers(object = rds, features = use.genes) + rds.markers = dplyr::group_by(rds.markers,cluster) %>% dplyr::arrange(desc(avg_log2FC)) }, error = function(e){ print (paste0("no marker found: ", e)) rds.markers <<- data.frame(cluster=double(), - gene=double(), - avg_logFC=double(), - pct.1=double(), - pct.2=double(), - p_val_adj=double()) + gene=double(), + avg_log2FC=double(), + pct.1=double(), + pct.2=double(), + p_val_adj=double()) }) + +rds.markers$cluster = as.numeric(rds.markers$cluster) print (rds.markers) write_tsv(rds.markers,marker.out,col_names = T) -df.tsne = rds@dr$tsne@cell.embeddings + +df.tsne = rds@reductions$tsne@cell.embeddings df.tsne = as.data.frame(df.tsne) meta = rds@meta.data -dic = rds@meta.data[[res_str]] +dic = rds@meta.data[['seurat_clusters']] names(dic) = rownames(rds@meta.data) df.tsne$cluster = as.numeric(dic[rownames(df.tsne)]) -df.gene = meta[,"nGene",drop=F] +rds@meta.data$seurat_clusters = as.numeric(dic[rownames(df.tsne)]) +df.gene = meta[,"nFeature_RNA",drop=F] colnames(df.gene) = "Gene_Counts" df.all = cbind(df.tsne,df.gene) write.table(df.all,tsne.out,sep="\t",col.names=NA,quote = F) + if (save_rds == 'True'){ saveRDS(rds, rds.out) } \ No newline at end of file diff --git a/conda_pkgs.txt b/conda_pkgs.txt index f356aa68..3f032270 100644 --- a/conda_pkgs.txt +++ b/conda_pkgs.txt @@ -5,7 +5,7 @@ picard=2.18.17 ucsc-gtftogenepred=377 subread=2.0.1 samtools=1.9 -r-seurat=2.3.4 +r-seurat=4.0.1 r-argparser r-tidyverse mixcr=3.0.3 -- Gitee From d877b57d4e45ce25cdd6502dd2bb7503d72cf57e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 16:15:16 +0800 Subject: [PATCH 05/96] add .DS_store --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 37518ec5..6fb58215 100755 --- a/.gitignore +++ b/.gitignore @@ -149,4 +149,7 @@ cython_debug/ #temp /temp/ +# .DS_store +.DS_store + -- Gitee From 5259474cbec7ea7b310cfa35f52b70eb936a2313 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 
16:15:32 +0800
Subject: [PATCH 06/96] rm h5py
--- celescope/tools/count.py | 1 - 1 file changed, 1 deletion(-)
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index 61d2545b..f3540ffa 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py
@@ -12,7 +12,6 @@ import subprocess from scipy.io import mmwrite from scipy.sparse import csr_matrix, coo_matrix import pysam -import h5py from celescope.tools.utils import add_log, format_number, glob_genomeDir, gene_convert, s_common, add_mem from celescope.tools.cellranger3.cell_calling_3 import cell_calling_3 from celescope.tools.__init__ import MATRIX_FILE_NAME, FEATURE_FILE_NAME, BARCODE_FILE_NAME
-- Gitee
From 7a5c11ca873541f93426b125ec667a47630b1be1 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 16:21:46 +0800
Subject: [PATCH 07/96] merge
--- celescope/tools/run_analysis.R | 3 --- 1 file changed, 3 deletions(-)
diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index a761d16c..5883c06f 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R
@@ -20,10 +20,7 @@ save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -<<<<<<< HEAD -======= matrix = Seurat::Read10X(matrix_dir, gene.column=2) ->>>>>>> 39b2447c47c4295f6fdbbe970dc21e2e43b5ee5b tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/")
-- Gitee
From 347163dd0f8610f2716fe981abc2fa4ca337b5c0 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 16:25:23 +0800
Subject: [PATCH 08/96] merge
--- celescope/tools/count.py | 296 --------------------------------------- 1 file changed, 296 deletions(-)
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index ae203eac..4e7711b8 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py
@@ -253,302 +253,6 @@ class Count(Step): df_sum = df_sum.sort_values(col, ascending=False) return df_sum -<<<<<<< HEAD - for u in umi_arr: - if float(_dict[umi_low]) / _dict[u] > percent: - break - if hd(umi_low, u) == 1: - _dict[u] += _dict[umi_low] - del (_dict[umi_low]) - break - res_dict[geneID] = _dict - return res_dict - - -@add_log -def bam2table(bam, detail_file): - # group reads with the same barcode in the BAM and count, per gene, the reads mapped to it - - samfile = pysam.AlignmentFile(bam, "rb") - with gzip.open(detail_file, 'wt') as fh1: - fh1.write('\t'.join(['Barcode', 'geneID', 'UMI', 'count']) + '\n') - - # pysam.libcalignedsegment.AlignedSegment - # AAACAGGCCAGCGTTAACACGACC_CCTAACGT_A00129:340:HHH72DSXX:2:1353:23276:30843 - # get the barcode of each read - def keyfunc(x): return x.query_name.split('_', 1)[0] - - for _, g in groupby(samfile, keyfunc): - gene_umi_dict = defaultdict(lambda: defaultdict(int)) - for seg in g: - (barcode, umi) = seg.query_name.split('_')[:2] - if not seg.has_tag('XT'): - continue - geneID = seg.get_tag('XT') - gene_umi_dict[geneID][umi] += 1 - res_dict = correct_umi(fh1, barcode, gene_umi_dict) - - # output - for geneID in res_dict: - for umi in res_dict[geneID]: - fh1.write('%s\t%s\t%s\t%s\n' % (barcode, geneID, umi, - res_dict[geneID][umi])) - samfile.close() - - -@add_log -def cell_calling(cell_calling_method, force_cell_num, expected_cell_num, all_matrix_10X_dir, df_sum, outdir, sample): - if (force_cell_num is not None) and (force_cell_num != 'None'): - cell_bc, UMI_threshold = force_cell(force_cell_num, df_sum) - elif cell_calling_method == 'auto': - cell_bc, UMI_threshold = auto_cell(df_sum, expected_cell_num)
- elif cell_calling_method == 'cellranger3': - cell_bc, UMI_threshold = cellranger3_cell(all_matrix_10X_dir, expected_cell_num, df_sum) - elif cell_calling_method == 'inflection': - _cell_bc, UMI_threshold = auto_cell(df_sum, expected_cell_num) - cell_bc, UMI_threshold = inflection_cell(outdir, sample, all_matrix_10X_dir, df_sum, UMI_threshold) - cell_calling.logger.info(f'UMI_threshold: {UMI_threshold}') - return cell_bc, UMI_threshold - - -@add_log -def force_cell(force_cell_num, df_sum): - force_cell_num = int(force_cell_num) - cell_range = int(force_cell_num * 0.1) - cell_low = force_cell_num - cell_range - cell_high = force_cell_num + cell_range - - df_barcode_count = df_sum.groupby( - ['UMI']).size().reset_index( - name='barcode_counts') - sorted_df = df_barcode_count.sort_values("UMI", ascending=False) - sorted_df["barcode_cumsum"] = sorted_df["barcode_counts"].cumsum() - for i in range(sorted_df.shape[0]): - if sorted_df.iloc[i, :]["barcode_cumsum"] >= cell_low: - index_low = i - 1 - break - for i in range(sorted_df.shape[0]): - if sorted_df.iloc[i, :]["barcode_cumsum"] >= cell_high: - index_high = i - break - df_sub = sorted_df.iloc[index_low:index_high + 1, :] - threshold = df_sub.iloc[np.argmax( - np.diff(df_sub["barcode_cumsum"])), :]["UMI"] - cell_bc = get_cell_bc(df_sum, threshold, col='UMI') - - return cell_bc, threshold - - -def find_threshold(df_sum, idx): - return int(df_sum.iloc[idx - 1, df_sum.columns == 'UMI']) - - -@add_log -def auto_cell(df_sum, expected_cell_num): - col = "UMI" - idx = int(expected_cell_num * 0.01) - barcode_number = df_sum.shape[0] - idx = int(min(barcode_number, idx)) - if idx == 0: - sys.exit("cell number equals zero!") - # calculate read counts threshold - threshold = int(find_threshold(df_sum, idx) * 0.1) - threshold = max(1, threshold) - cell_bc = get_cell_bc(df_sum, threshold) - - return cell_bc, threshold - - -@add_log -def cellranger3_cell(all_matrix_10X_dir, expected_cell_num, df_sum): - cell_bc, initial_cell_num = cell_calling_3(all_matrix_10X_dir, expected_cell_num) - threshold = find_threshold(df_sum, initial_cell_num) - return cell_bc, threshold - - -@add_log -def inflection_cell(outdir, sample, all_matrix_10X_dir, df_sum, threshold): - app = f'{toolsdir}/rescue.R' - cmd = ( - f'Rscript {app} ' - f'--matrix_dir {all_matrix_10X_dir} ' - f'--outdir {outdir} ' - f'--sample {sample} ' - f'--threshold {threshold}' - ) - inflection_cell.logger.info(cmd) - subprocess.check_call(cmd, shell=True) - out_file = f'{outdir}/{sample}_rescue.tsv' - df = pd.read_csv(out_file, sep='\t') - inflection = int(df.loc[:,'inflection']) - threshold = inflection - cell_bc = get_cell_bc(df_sum, threshold) - - return cell_bc, threshold - - -@add_log -def get_df_sum(df, col='UMI'): - def num_gt2(x): - return pd.Series.sum(x[x > 1]) - - df_sum = df.groupby('Barcode').agg({ - 'count': ['sum', num_gt2], - 'UMI': 'count', - 'geneID': 'nunique' - }) - df_sum.columns = ['readcount', 'UMI2', 'UMI', 'geneID'] - df_sum = df_sum.sort_values(col, ascending=False) - return df_sum - -def get_cell_bc(df_sum, threshold, col='UMI'): - return list(df_sum[df_sum[col] >= threshold].index) - -@add_log -def plot_barcode_UMI(df_sum, threshold, expected_cell_num, cell_num, outdir, sample, cell_calling_method, col='UMI'): - out_plot = f'{outdir}/{sample}_barcode_UMI_plot.pdf' - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - fig = plt.figure() - plt.plot(df_sum['UMI']) - plt.hlines(threshold, 0, cell_num, linestyle='dashed') - plt.vlines(cell_num, 0, 
threshold, linestyle='dashed') - plt.title('cell_calling_method: %s, expected_cell_num: %s\n %s threshold: %s, cell num: %s' % - (cell_calling_method, expected_cell_num, col, threshold, cell_num)) - plt.loglog() - plt.savefig(out_plot) - - -def get_cell_stats(df_sum, cell_bc, marked_counts_file): - df_sum.loc[:, 'mark'] = 'UB' - df_sum.loc[df_sum.index.isin(cell_bc), 'mark'] = 'CB' - df_sum.to_csv(marked_counts_file, sep='\t') - CB_describe = df_sum.loc[df_sum['mark'] == 'CB', :].describe() - - return CB_describe - - -def write_matrix_10X(table, id_name, matrix_10X_dir): - id = table.index.to_series() - name = id.apply(lambda x: id_name[x]) - genes = pd.concat([id, name], axis=1) - genes.columns = ['gene_id', 'gene_name'] - - #write - table.columns.to_series().to_csv( - f'{matrix_10X_dir}/barcodes.tsv', index=False, sep='\t') - genes.to_csv( - f'{matrix_10X_dir}/genes.tsv', index=False, header=False, sep='\t') - mmwrite(f'{matrix_10X_dir}/matrix', csr_matrix(table)) - return id, name - - -@add_log -def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', cell_bc=None): - matrix_10X_dir = f"{outdir}/{sample}_{dir_name}/" - if not os.path.exists(matrix_10X_dir): - os.mkdir(matrix_10X_dir) - id_name = gene_convert(gtf_file) - - if cell_bc is not None: - df = df.loc[df['Barcode'].isin(cell_bc), :] - - df_UMI = df.groupby(['geneID','Barcode']).agg({'UMI':'count'}) - mtx= coo_matrix((df_UMI.UMI, (df_UMI.index.labels[0], df_UMI.index.labels[1]))) - id = df_UMI.index.levels[0].to_series() - # add gene symbol - name = id.apply(lambda x: id_name[x]) - genes = pd.concat([id, name], axis=1) - genes.columns = ['gene_id', 'gene_name'] - - barcodes = df_UMI.index.levels[1].to_series() - genes.to_csv(f'{matrix_10X_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) - barcodes.to_csv(f'{matrix_10X_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') - mmwrite(f'{matrix_10X_dir}/{MATRIX_FILE_NAME}', mtx) - - - return matrix_10X_dir - - -@add_log -def expression_matrix(df, cell_bc, outdir, sample, gtf_file): - - id_name = gene_convert(gtf_file) - - df.loc[:, 'mark'] = 'UB' - df.loc[df['Barcode'].isin(cell_bc), 'mark'] = 'CB' - CB_total_Genes = df.loc[df['mark'] == 'CB', 'geneID'].nunique() - CB_reads_count = df.loc[df['mark'] == 'CB', 'count'].sum() - reads_mapped_to_transcriptome = df['count'].sum() - - table = df.loc[df['mark'] == 'CB', :].pivot_table( - index='geneID', columns='Barcode', values='UMI', - aggfunc=len).fillna(0).astype(int) - - # convert id to name; write table matrix - matrix_table_file = f"{outdir}/{sample}_matrix.tsv.gz" - id = table.index.to_series() - name = id.apply(lambda x: id_name[x]) - table.index = name - table.index.name = "" - table.to_csv( - matrix_table_file, - sep="\t", - compression='gzip') - return(CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome) - - -def get_summary(df, sample, Saturation, CB_describe, CB_total_Genes, - CB_reads_count, reads_mapped_to_transcriptome, - stat_file, outdir): - - # total read - json_file = outdir + '.data.json' - fh = open(json_file) - data = json.load(fh) - str_number = data['barcode_summary'][1][1].split("(")[0] - valid_read_number = int(str_number.replace(",", "")) - - summary = pd.Series([0, 0, 0, 0, 0, 0, 0], - index=[ - 'Estimated Number of Cells', - 'Fraction Reads in Cells', - 'Mean Reads per Cell', - 'Median UMI per Cell', - 'Total Genes', - 'Median Genes per Cell', - 'Saturation', - ]) - - # number of cells - summary['Estimated Number of Cells'] = int( - CB_describe.loc['count', 'readcount']) - summary['Fraction Reads in Cells'] = '%.2f%%' % (float( - CB_reads_count) / reads_mapped_to_transcriptome * 100) - summary['Mean Reads per Cell'] = int( - valid_read_number / - summary['Estimated Number of Cells']) - summary['Median UMI per Cell'] = int(CB_describe.loc['50%', 'UMI']) - summary['Total Genes'] = int(CB_total_Genes) - summary['Median Genes per Cell'] = int(CB_describe.loc['50%', 'geneID']) - summary['Saturation'] = '%.2f%%' % (Saturation) - # sequencing saturation, defined as the fraction of reads in cells whose UMI has count > 2 - need_format = [ - 'Estimated Number of Cells', - 'Mean Reads per Cell', - 'Median UMI per Cell', - 'Total Genes', - 'Median Genes per Cell'] - for item in need_format: - summary[item] = format_number(summary[item]) - summary.to_csv(stat_file, header=False, sep=':') - - -@add_log -def sub_sample(fraction, df_cell, cell_bc, cell_read_index): -======= ->>>>>>> 39b2447c47c4295f6fdbbe970dc21e2e43b5ee5b ''' @utils.add_log def plot_barcode_UMI(df_sum, threshold, expected_cell_num, cell_num, outdir, sample, cell_calling_method, col='UMI'):
-- Gitee
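[Editor's note — not part of the patch series: the block deleted by PATCH 08 is a leftover pre-refactor copy of count.py from a merge conflict. Its UMI-correction step folds each low-count UMI into a Hamming-distance-1 neighbour unless their counts are too close. A standalone sketch of that idea, with hypothetical names:]

    def hamming_distance(a, b):
        return sum(x != y for x, y in zip(a, b))

    def collapse_umis(umi_counts, percent=0.1):
        # umi_counts: dict mapping UMI -> read count for one gene
        for low in sorted(umi_counts, key=umi_counts.get):
            for high in sorted(umi_counts, key=umi_counts.get, reverse=True):
                if umi_counts[low] / umi_counts[high] > percent:
                    break  # remaining candidates are too close in abundance to merge
                if hamming_distance(low, high) == 1:
                    umi_counts[high] += umi_counts.pop(low)
                    break
        return umi_counts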
From d11da15cef94e649d034d69a25a350b0bee98c45 Mon Sep 17 00:00:00 2001 From: seeuzhouxin Date: Mon, 10 May 2021 16:40:28 +0800
Subject: [PATCH 09/96] Delete file .DS_Store
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
--- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store
diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index fde5083c0902b4b0b5ff087a8db22e54d5043b62..0000000000000000000000000000000000000000
GIT binary patch literal 0 HcmV?d00001 literal 6148 [binary literal data omitted]
-- Gitee
From 4d5869cd1b8bbdaa2ea949fd7bbd961d3ee7fa3b Mon Sep 17 00:00:00 2001 From: seeuzhouxin Date: Mon, 10 May 2021 16:40:36 +0800
Subject: [PATCH 10/96] Delete file celescope/.DS_Store
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
--- celescope/.DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 celescope/.DS_Store
diff --git a/celescope/.DS_Store b/celescope/.DS_Store deleted file mode 100644 index 78b57bb8670a1b98f6c056cf4c4e0cce4fd8f49b..0000000000000000000000000000000000000000
GIT binary patch literal 0 HcmV?d00001 literal 8196 [binary literal data omitted; the tail of this patch, the mail separator, and the commit hash and author line of PATCH 11 were lost in extraction]
Date: Mon, 10 May 2021 17:50:50 +0800
Subject: [PATCH 11/96] fix
--- celescope/tools/analysis.py | 10 +++++----- celescope/tools/run_analysis.R | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/celescope/tools/analysis.py b/celescope/tools/analysis.py index 5cfc474a..0de33bf2 100755 --- a/celescope/tools/analysis.py +++ b/celescope/tools/analysis.py
@@ -35,10 +35,10 @@ def generate_matrix(gtf_file, matrix_file): @add_log -def seurat(sample, outdir, matrix_file,
save_rds): +def seurat(sample, outdir, matrix_dir, save_rds): app = TOOLSDIR + "/run_analysis.R" cmd = ( - f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_file {matrix_file} ' + f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_dir {matrix_dir} ' f'--save_rds {save_rds}' ) seurat.logger.info(cmd) @@ -64,7 +64,7 @@ class Analysis_rna(Step, AnalysisMixin): def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) - self.matrix_file = args.matrix_file + self.matrix_dir = args.matrix_dir self.type_marker_tsv = args.type_marker_tsv self.auto_assign_bool = False self.save_rds = args.save_rds @@ -73,7 +73,7 @@ class Analysis_rna(Step, AnalysisMixin): self.save_rds = True def run(self): - seurat(self.sample, self.outdir, self.matrix_file, self.save_rds) + seurat(self.sample, self.outdir, self.matrix_dir, self.save_rds) if self.auto_assign_bool: auto_assign(self.sample, self.outdir, self.type_marker_tsv) self.run_analysis() @@ -95,7 +95,7 @@ def analysis(args): def get_opts_analysis(parser, sub_program): if sub_program: parser = s_common(parser) - parser.add_argument('--matrix_file', help='matrix file', required=True) + parser.add_argument('--matrix_dir', help='matrix dir', required=True) parser.add_argument('--save_rds', action='store_true', help='write rds to disk') parser.add_argument('--type_marker_tsv', help='cell type marker tsv') diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 5883c06f..bc49c7fd 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -13,7 +13,7 @@ argv <- add_argument(argv,"--save_rds", help="write rds to disk") argv <- parse_args(argv) #args -matrix_file = argv$matrix_file +matrix_dir = argv$matrix_dir outdir = argv$outdir sample = argv$sample save_rds = argv$save_rds @@ -27,7 +27,6 @@ mito.out = paste(outdir,"stat.txt",sep="/") rds.out = paste0(outdir,'/',sample,'.rds') # read 10X -matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") rds = CreateSeuratObject(matrix, pro=sample) # generate h5ad file -- Gitee From e91ee61875d65903d1b2e6a10186caf46029a329 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 11 May 2021 16:14:08 +0800 Subject: [PATCH 12/96] merge --- Dockerfile | 0 celescope/capture_virus/otsu.py | 0 celescope/capture_virus/test.py | 0 celescope/templates/css/buttons.dataTables.min.css | 0 celescope/templates/css/dataTables.jqueryui.min.css | 0 celescope/templates/css/jquery-ui-git.css | 0 celescope/templates/css/jquery-ui.css | 0 celescope/templates/html/common/consensus_summary.html | 0 .../templates/html/snp/target_metrics_summary.html | 0 celescope/templates/js/buttons.flash.min.js | 0 celescope/templates/js/buttons.html5.min.js | 0 celescope/templates/js/buttons.print.min.js | 0 celescope/templates/js/dataTables.buttons.min.js | 0 celescope/templates/js/dataTables.jqueryui.min.js | 0 celescope/templates/js/jquery.dataTables.min.js | 0 celescope/templates/js/jquery.min.3.3.1.js | 0 celescope/templates/js/jszip.min.js | 0 celescope/templates/js/plotly-1.58.4.min.js | 0 celescope/tests/__init__.py | 0 celescope/tests/func_tests.py | 0 celescope/tests/multi_tests.py | 0 celescope/tests/test_legacy.py | 0 celescope/tools/Step.py | 0 celescope/tools/analysis.py | 10 +++++----- celescope/tools/analysisMixin.py | 2 +- celescope/tools/cellranger3/__init__.py | 0 celescope/tools/cellranger3/cell_calling_3.py | 0 celescope/tools/cellranger3/get_plot_elements.py | 0 celescope/tools/cellranger3/sgt.py | 0 
celescope/tools/cellranger3/stats.py | 0 celescope/tools/consensus.py | 0 celescope/tools/rescue.R | 0 celescope/tools/run_analysis.R | 6 +++--- celescope/tools/target_metrics.py | 0 conda_pkgs.txt | 0 docs/Multi-samples.md | 0 docs/STAR.md | 0 docs/analysis.md | 0 docs/barcode.md | 0 docs/consensus.md | 0 docs/count.md | 0 docs/count_vdj.md | 0 docs/cutadapt.md | 0 docs/featureCounts.md | 0 docs/mapping_vdj.md | 0 docs/v1.1.8.md | 0 docs/v1.1.9.md | 0 47 files changed, 9 insertions(+), 9 deletions(-) mode change 100644 => 100755 Dockerfile mode change 100644 => 100755 celescope/capture_virus/otsu.py mode change 100644 => 100755 celescope/capture_virus/test.py mode change 100644 => 100755 celescope/templates/css/buttons.dataTables.min.css mode change 100644 => 100755 celescope/templates/css/dataTables.jqueryui.min.css mode change 100644 => 100755 celescope/templates/css/jquery-ui-git.css mode change 100644 => 100755 celescope/templates/css/jquery-ui.css mode change 100644 => 100755 celescope/templates/html/common/consensus_summary.html mode change 100644 => 100755 celescope/templates/html/snp/target_metrics_summary.html mode change 100644 => 100755 celescope/templates/js/buttons.flash.min.js mode change 100644 => 100755 celescope/templates/js/buttons.html5.min.js mode change 100644 => 100755 celescope/templates/js/buttons.print.min.js mode change 100644 => 100755 celescope/templates/js/dataTables.buttons.min.js mode change 100644 => 100755 celescope/templates/js/dataTables.jqueryui.min.js mode change 100644 => 100755 celescope/templates/js/jquery.dataTables.min.js mode change 100644 => 100755 celescope/templates/js/jquery.min.3.3.1.js mode change 100644 => 100755 celescope/templates/js/jszip.min.js mode change 100644 => 100755 celescope/templates/js/plotly-1.58.4.min.js mode change 100644 => 100755 celescope/tests/__init__.py mode change 100644 => 100755 celescope/tests/func_tests.py mode change 100644 => 100755 celescope/tests/multi_tests.py mode change 100644 => 100755 celescope/tests/test_legacy.py mode change 100644 => 100755 celescope/tools/Step.py mode change 100644 => 100755 celescope/tools/analysisMixin.py mode change 100644 => 100755 celescope/tools/cellranger3/__init__.py mode change 100644 => 100755 celescope/tools/cellranger3/cell_calling_3.py mode change 100644 => 100755 celescope/tools/cellranger3/get_plot_elements.py mode change 100644 => 100755 celescope/tools/cellranger3/sgt.py mode change 100644 => 100755 celescope/tools/cellranger3/stats.py mode change 100644 => 100755 celescope/tools/consensus.py mode change 100644 => 100755 celescope/tools/rescue.R mode change 100644 => 100755 celescope/tools/target_metrics.py mode change 100644 => 100755 conda_pkgs.txt mode change 100644 => 100755 docs/Multi-samples.md mode change 100644 => 100755 docs/STAR.md mode change 100644 => 100755 docs/analysis.md mode change 100644 => 100755 docs/barcode.md mode change 100644 => 100755 docs/consensus.md mode change 100644 => 100755 docs/count.md mode change 100644 => 100755 docs/count_vdj.md mode change 100644 => 100755 docs/cutadapt.md mode change 100644 => 100755 docs/featureCounts.md mode change 100644 => 100755 docs/mapping_vdj.md mode change 100644 => 100755 docs/v1.1.8.md mode change 100644 => 100755 docs/v1.1.9.md diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 diff --git a/celescope/capture_virus/otsu.py b/celescope/capture_virus/otsu.py old mode 100644 new mode 100755 diff --git a/celescope/capture_virus/test.py b/celescope/capture_virus/test.py old mode 100644 
new mode 100755 diff --git a/celescope/templates/css/buttons.dataTables.min.css b/celescope/templates/css/buttons.dataTables.min.css old mode 100644 new mode 100755 diff --git a/celescope/templates/css/dataTables.jqueryui.min.css b/celescope/templates/css/dataTables.jqueryui.min.css old mode 100644 new mode 100755 diff --git a/celescope/templates/css/jquery-ui-git.css b/celescope/templates/css/jquery-ui-git.css old mode 100644 new mode 100755 diff --git a/celescope/templates/css/jquery-ui.css b/celescope/templates/css/jquery-ui.css old mode 100644 new mode 100755 diff --git a/celescope/templates/html/common/consensus_summary.html b/celescope/templates/html/common/consensus_summary.html old mode 100644 new mode 100755 diff --git a/celescope/templates/html/snp/target_metrics_summary.html b/celescope/templates/html/snp/target_metrics_summary.html old mode 100644 new mode 100755 diff --git a/celescope/templates/js/buttons.flash.min.js b/celescope/templates/js/buttons.flash.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/buttons.html5.min.js b/celescope/templates/js/buttons.html5.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/buttons.print.min.js b/celescope/templates/js/buttons.print.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/dataTables.buttons.min.js b/celescope/templates/js/dataTables.buttons.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/dataTables.jqueryui.min.js b/celescope/templates/js/dataTables.jqueryui.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/jquery.dataTables.min.js b/celescope/templates/js/jquery.dataTables.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/jquery.min.3.3.1.js b/celescope/templates/js/jquery.min.3.3.1.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/jszip.min.js b/celescope/templates/js/jszip.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/plotly-1.58.4.min.js b/celescope/templates/js/plotly-1.58.4.min.js old mode 100644 new mode 100755 diff --git a/celescope/tests/__init__.py b/celescope/tests/__init__.py old mode 100644 new mode 100755 diff --git a/celescope/tests/func_tests.py b/celescope/tests/func_tests.py old mode 100644 new mode 100755 diff --git a/celescope/tests/multi_tests.py b/celescope/tests/multi_tests.py old mode 100644 new mode 100755 diff --git a/celescope/tests/test_legacy.py b/celescope/tests/test_legacy.py old mode 100644 new mode 100755 diff --git a/celescope/tools/Step.py b/celescope/tools/Step.py old mode 100644 new mode 100755 diff --git a/celescope/tools/analysis.py b/celescope/tools/analysis.py index 0de33bf2..5cfc474a 100755 --- a/celescope/tools/analysis.py +++ b/celescope/tools/analysis.py @@ -35,10 +35,10 @@ def generate_matrix(gtf_file, matrix_file): @add_log -def seurat(sample, outdir, matrix_dir, save_rds): +def seurat(sample, outdir, matrix_file, save_rds): app = TOOLSDIR + "/run_analysis.R" cmd = ( - f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_dir {matrix_dir} ' + f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_file {matrix_file} ' f'--save_rds {save_rds}' ) seurat.logger.info(cmd) @@ -64,7 +64,7 @@ class Analysis_rna(Step, AnalysisMixin): def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) - self.matrix_dir = args.matrix_dir + self.matrix_file = args.matrix_file self.type_marker_tsv = args.type_marker_tsv self.auto_assign_bool = 
False self.save_rds = args.save_rds @@ -73,7 +73,7 @@ class Analysis_rna(Step, AnalysisMixin): self.save_rds = True def run(self): - seurat(self.sample, self.outdir, self.matrix_dir, self.save_rds) + seurat(self.sample, self.outdir, self.matrix_file, self.save_rds) if self.auto_assign_bool: auto_assign(self.sample, self.outdir, self.type_marker_tsv) self.run_analysis() @@ -95,7 +95,7 @@ def analysis(args): def get_opts_analysis(parser, sub_program): if sub_program: parser = s_common(parser) - parser.add_argument('--matrix_dir', help='matrix dir', required=True) + parser.add_argument('--matrix_file', help='matrix file', required=True) parser.add_argument('--save_rds', action='store_true', help='write rds to disk') parser.add_argument('--type_marker_tsv', help='cell type marker tsv') diff --git a/celescope/tools/analysisMixin.py b/celescope/tools/analysisMixin.py old mode 100644 new mode 100755 index df2350d5..5e5e5dbc --- a/celescope/tools/analysisMixin.py +++ b/celescope/tools/analysisMixin.py @@ -50,7 +50,7 @@ class AnalysisMixin(): return html code """ marker_df = self.marker_df.loc[:, - ["cluster", "gene", "avg_logFC", "pct.1", "pct.2", "p_val_adj"] + ["cluster", "gene", "avg_log2FC", "pct.1", "pct.2", "p_val_adj"] ] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") diff --git a/celescope/tools/cellranger3/__init__.py b/celescope/tools/cellranger3/__init__.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/cell_calling_3.py b/celescope/tools/cellranger3/cell_calling_3.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/get_plot_elements.py b/celescope/tools/cellranger3/get_plot_elements.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/sgt.py b/celescope/tools/cellranger3/sgt.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/stats.py b/celescope/tools/cellranger3/stats.py old mode 100644 new mode 100755 diff --git a/celescope/tools/consensus.py b/celescope/tools/consensus.py old mode 100644 new mode 100755 diff --git a/celescope/tools/rescue.R b/celescope/tools/rescue.R old mode 100644 new mode 100755 diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index bc49c7fd..2df168aa 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -6,21 +6,21 @@ library(rhdf5) argv <- arg_parser('') -argv <- add_argument(argv,"--matrix_dir", help="cell 10X matrix dir") +argv <- add_argument(argv,"--matrix_file", help="cell 10X matrix dir") argv <- add_argument(argv,"--outdir", help="outdir") argv <- add_argument(argv,"--sample", help="sample") argv <- add_argument(argv,"--save_rds", help="write rds to disk") argv <- parse_args(argv) #args -matrix_dir = argv$matrix_dir +matrix_file = argv$matrix_file outdir = argv$outdir sample = argv$sample save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -matrix = Seurat::Read10X(matrix_dir, gene.column=2) +matrix = Seurat::Read10X(matrix_file, gene.column=2) tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/") diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py old mode 100644 new mode 100755 diff --git a/conda_pkgs.txt b/conda_pkgs.txt old mode 100644 new mode 100755 diff --git a/docs/Multi-samples.md b/docs/Multi-samples.md old mode 100644 new mode 100755 diff --git a/docs/STAR.md b/docs/STAR.md old mode 100644 
new mode 100755 diff --git a/docs/analysis.md b/docs/analysis.md old mode 100644 new mode 100755 diff --git a/docs/barcode.md b/docs/barcode.md old mode 100644 new mode 100755 diff --git a/docs/consensus.md b/docs/consensus.md old mode 100644 new mode 100755 diff --git a/docs/count.md b/docs/count.md old mode 100644 new mode 100755 diff --git a/docs/count_vdj.md b/docs/count_vdj.md old mode 100644 new mode 100755 diff --git a/docs/cutadapt.md b/docs/cutadapt.md old mode 100644 new mode 100755 diff --git a/docs/featureCounts.md b/docs/featureCounts.md old mode 100644 new mode 100755 diff --git a/docs/mapping_vdj.md b/docs/mapping_vdj.md old mode 100644 new mode 100755 diff --git a/docs/v1.1.8.md b/docs/v1.1.8.md old mode 100644 new mode 100755 diff --git a/docs/v1.1.9.md b/docs/v1.1.9.md old mode 100644 new mode 100755
-- Gitee
From f6dfd7baca55bf81ad4a0ed1f66c188e592925be Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 11 May 2021 18:09:12 +0800
Subject: [PATCH 13/96] avg_logFC to avg_log2FC
--- celescope/tools/Analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/celescope/tools/Analysis.py b/celescope/tools/Analysis.py index 21210743..ac4aabde 100755 --- a/celescope/tools/Analysis.py +++ b/celescope/tools/Analysis.py
@@ -106,7 +106,7 @@ class Analysis(): return html code """ marker_df = self.marker_df.loc[:, ["cluster", "gene", - "avg_logFC", "pct.1", "pct.2", "p_val_adj"]] + "avg_log2FC", "pct.1", "pct.2", "p_val_adj"]] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") return marker_df
@@ -124,7 +124,7 @@ class Analysis(): return html code """ marker_df = self.marker_df.loc[:, ["cluster", "gene", - "avg_logFC", "pct.1", "pct.2", "p_val_adj"]] + "avg_log2FC", "pct.1", "pct.2", "p_val_adj"]] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") marker_gene_table = marker_df.to_html( escape=False,
-- Gitee
From 9fe1f4632bcd19d0a3b6e0691ae03bf5ad001b92 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 11 May 2021 18:34:34 +0800
Subject: [PATCH 14/96] change FindVariableFeatures parameters
--- celescope/tools/run_analysis.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 2df168aa..4a050c29 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R
@@ -62,8 +62,7 @@ write_delim(mito_df, mito.out, col_names=F, delim=":") rds <- NormalizeData(rds, normalization.method = "LogNormalize",scale.factor = 10000) -rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 2000, mean.cutoff = c(0.1, 8), dispersion.cutoff = c(1, Inf), - mean.function = ExpMean, dispersion.function = LogVMR) +rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 10000) use.genes <- rds@assays$RNA@var.features rds <- ScaleData(rds, vars.to.regress = c("nCount_RNA", "percent.mito"), features = use.genes)
-- Gitee
From 5ddd153d90f4733901ecefb8dc01460e103b1051 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 17 May 2021 13:57:04 +0800
Subject: [PATCH 15/96] fix
--- celescope/tools/run_analysis.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 4a050c29..bb654e94 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R
@@ -35,13 +35,16 @@ mtx = as.matrix(x) barcode = colnames(rds) geneid = rownames(rds) h5.out = stringr::str_glue('{outdir}/{sample}.h5') -path <-
path.expand(h5.out) -h5createFile(path) -h5f <- H5Fopen(path) -h5writeDataset(mtx,h5f,"X") -h5writeDataset(barcode,h5f,"obs") -h5writeDataset(geneid,h5f,"var") -H5Fclose(h5f) +if (file.exists(h5.out) == FALSE){ + path <- path.expand(h5.out) + h5createFile(path) + h5f <- H5Fopen(path) + h5writeDataset(mtx,h5f,"X") + h5writeDataset(barcode,h5f,"obs") + h5writeDataset(geneid,h5f,"var") + H5Fclose(h5f) +} + # mito mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@assays$RNA@data), value = TRUE, ignore.case=TRUE)
-- Gitee
From 8aa87820def4eb6e2dca90fb348e7e35911410ae Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 27 May 2021 15:42:32 +0800
Subject: [PATCH 16/96] add tracer_vdj
--- celescope/tracer_vdj/__init__.py | 7 ++ celescope/tracer_vdj/go_assemble.py | 140 +++++++++++++++++++++ celescope/tracer_vdj/multi_tracer_vdj.py | 57 +++++++++ celescope/tracer_vdj/split_fastq.py | 125 ++++++++++++++++++++ 4 files changed, 329 insertions(+) create mode 100644 celescope/tracer_vdj/__init__.py create mode 100755 celescope/tracer_vdj/go_assemble.py create mode 100755 celescope/tracer_vdj/multi_tracer_vdj.py create mode 100755 celescope/tracer_vdj/split_fastq.py
diff --git a/celescope/tracer_vdj/__init__.py b/celescope/tracer_vdj/__init__.py new file mode 100644 index 00000000..eb48cf9f --- /dev/null +++ b/celescope/tracer_vdj/__init__.py
@@ -0,0 +1,7 @@ +__STEPS__ = [ + 'sample', + 'barcode', + 'cutadapt', + 'split_fastq', + 'go_assemble'] +__ASSAY__ = 'tracer_vdj'
diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py new file mode 100755 index 00000000..65f4696a --- /dev/null +++ b/celescope/tracer_vdj/go_assemble.py
@@ -0,0 +1,140 @@ +import argparse +import os +from os import listdir +from os.path import isfile, join +from concurrent.futures import ProcessPoolExecutor +from celescope.tools.utils import add_log +import datetime + + +TRACER_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer' +CONF_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer.conf' +BRACER_PATH = '/SGRNJ03/randd/zhouxin/software/bracer/bracer' +BRACER_CONDA = 'bracer' +BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' + + +# start assembly + + +def bracer_summarise(outdir): + bracer_outdir = f'{outdir}/bracer' + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{BRACER_PATH} summarise ' + f'-c {BRACER_CONF} ' + f'--no_networks ' + f'{bracer_outdir} ' + ) + bracer_summarise.logger.info(cmd) + os.system(cmd) + + +def bracer(fq, outdir, species): + prefix = os.path.basename(fq).strip('.fq') + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{BRACER_PATH} assemble ' + f'--fragment_length 150 ' + f'--fragment_sd 5 ' + f'--single_end ' + f'--small_index ' + f'--species {species} ' + f'-c {BRACER_CONF} ' + f'{prefix} ' + f'{outdir}/bracer ' + f'{fq} ' + ) + bracer.logger.info(cmd) + os.system(cmd) + + +def tracer_summarise(outdir): + tracer_outdir = f'{outdir}/tracer' + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{TRACER_PATH} summarise ' + f'-c {CONF_PATH} ' + f'--no_networks ' + f'{tracer_outdir} ' + ) + tracer_summarise.logger.info(cmd) + os.system(cmd) + + +def tracer(fq, outdir, species): + prefix = os.path.basename(fq).strip('.fq') + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{TRACER_PATH} assemble ' + f'--fragment_length 150 ' + f'--fragment_sd 5 ' + f'--single_end ' + f'--small_index ' + f'-m assembly ' + f'--species {species} ' + f'-c {CONF_PATH} ' + f'{fq} ' + f'{prefix} ' + f'{outdir}/tracer ' + ) + tracer.logger.info(cmd) + os.system(cmd) + +
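# [Editor's note — not part of the patch] In bracer() and tracer() above,
# os.path.basename(fq).strip('.fq') strips any leading/trailing '.', 'f' or 'q'
# characters, not the '.fq' suffix: "q1.fq".strip('.fq') gives "1" rather than
# the intended "q1" (while "sample.fq" only survives as "sample" by luck).
# A suffix-safe alternative would be:
#     prefix = os.path.splitext(os.path.basename(fq))[0]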
+@add_log +def run_tracer(outdir, fastq_dir, species, thread): + + fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] + outdirs = [outdir] * len(fqs) + species = [species] * len(fqs) + if not os.path.exists(f'{outdir}/tracer'): + os.makedirs(f'{outdir}/tracer') + + all_res = [] + with ProcessPoolExecutor(thread) as pool: + for res in pool.map(tracer, fqs, outdirs, species): + all_res.append(res) + + tracer_summarise(outdir) + + +@add_log +def run_bracer(outdir, fastq_dir, species, thread): + fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] + outdirs = [outdir] * len(fqs) + species = [species] * len(fqs) + if not os.path.exists(f'{outdir}/bracer'): + os.makedirs(f'{outdir}/bracer') + + all_res = [] + with ProcessPoolExecutor(thread) as pool: + for res in pool.map(bracer, fqs, outdirs, species): + all_res.append(res) + + bracer_summarise(outdir) + + +def go_assemble(args): + thread = int(args.thread) + fastq_dir = args.fastq_dir + outdir = args.outdir + species = args.species + + mode = args.mode + if mode == 'TCR': + run_tracer(outdir, fastq_dir, species, thread) + elif mode == 'BCR': + run_bracer(outdir, fastq_dir, species, thread) + + +def get_opts_go_assemble(parser, sub_program): + if sub_program: + parser.add_argument("--outdir", help="assemble outdir", required=True) + parser.add_argument("--sample", help="vdj sample name", required=True) + parser.add_argument('--assay', help='assay', required=True) + parser.add_argument('--fastq_dir', required=True) + parser.add_argument('--mode', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--thread', help='thread', default=20) + diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py new file mode 100755 index 00000000..97beed69 --- /dev/null +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -0,0 +1,57 @@ +from celescope.tracer_vdj.__init__ import __STEPS__, __ASSAY__ +from celescope.tools.Multi import Multi + + +class Multi_tracer_vdj(Multi): + def custome_args(self): + self.parser.add_argument('--thread', help='thread', default=20) + self.parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR']) + self.parser.add_argument('--species', help='species name', choices=['Hsap', 'Mmus']) + + def read_custome_args(self): + self.thread = self.args.thread + self.mode = self.args.mode + self.species = self.args.species + + def split_fastq(self, sample): + step = 'split_fastq' + fq = f'{self.outdir_dic[sample]["cutadapt"]}/{sample}_clean_2.fq{self.fq_suffix}' + cmd = ( + f'{self.__APP__} ' + f'{self.__ASSAY__} ' + f'{step} ' + f'--outdir {self.outdir_dic[sample][step]} ' + f'--sample {sample} ' + f'--assay {self.__ASSAY__} ' + f'--fq {fq} ' + f'--mode {self.mode} ' + f'--match_dir {self.col4_dict[sample]} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + + def go_assemble(self, sample): + step = 'go_assemble' + fastq_dir = f'{self.outdir_dic[sample]["split_fq"]}/fastq' + cmd = ( + f'{self.__APP__} ' + f'{self.__ASSAY__} ' + f'{step} ' + f'--outdir {self.outdir_dic[sample][step]} ' + f'--sample {sample} ' + f'--assay {self.__ASSAY__} ' + f'--fastq_dir {fastq_dir} ' + f'--mode {self.mode} ' + f'--species {self.species} ' + f'--thread {self.thread} ' + ) + self.process_cmd(cmd, step, sample, m=1.5 * int(self.args.thread), x=self.args.thread) + + +def main(): + multi = Multi_tracer_vdj(__ASSAY__) + 
multi.run() + +if __name__ == '__main__': + main() + diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py new file mode 100755 index 00000000..70ada28b --- /dev/null +++ b/celescope/tracer_vdj/split_fastq.py @@ -0,0 +1,125 @@ +import pysam +from collections import defaultdict +import os +import argparse +import datetime +import pandas as pd +from Bio.Seq import Seq +from glob import glob +from celescope.tools.utils import add_log + + +@add_log +def annotation_barcodes(match_dir, mode): + + cluster_data = glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv')[0] + + cluster_type = pd.read_csv(cluster_data, sep='\t') + + # filter barcodes + if mode == 'TCR': + clusters = list(cluster_type[cluster_type['cell_type'] == 'T cells']['cluster']) + elif mode == 'BCR': + clusters = list(cluster_type[cluster_type['cell_type'] == 'B cells']['cluster']) + + tsne = glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv')[0] + tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) + + barcodes = [] + for cluster in clusters: + tmp = tsne_coord[tsne_coord['cluster'] == cluster].index.tolist() + barcodes += tmp + # write barcodes + barcodes_path = glob(f'{match_dir}/06.analysis/*_auto_assign/')[0] + + with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: + for barcode in barcodes: + barcode = Seq(barcode) + barcode_reversed = barcode.reverse_complement() + bc = str(barcode_reversed) + fh.write(bc + '\n') + + with open(f'{barcodes_path}/reversed_barcodes.tsv') as res: + res = res.readlines() + return res + + +@add_log +def get_fastq_to_assemble(fq_outdir, fq, barcodes): + """ + split_fastq + """ + if not os.path.exists(fq_outdir): + os.makedirs(fq_outdir) + + barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads + reads_count_dict = {} # all barcodes and reads num for each barcode + all_barcodes = [] # all barcodes + with pysam.FastxFile(fq) as fq: + for entry in fq: + attr = entry.name.split('_') + barcode = attr[0] + all_barcodes.append(barcode) + barcode_reads_dict[barcode].append(entry) + for barcode in list(barcode_reads_dict.keys()): + reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) + + + barcodes_for_match = [] + for barcode in barcodes: + barcode = barcode.strip('\n') + barcodes_for_match.append(barcode) + barcodes_to_use = list(set(barcodes_for_match).intersection(set(all_barcodes))) + # barcodes in both RNA data and BCR data + + barcode_reads_useful = {barcode: barcode_reads_dict[barcode] for barcode in barcodes_to_use} + + + barcodes_reads_count = {barcode: reads_count_dict[barcode] for barcode in + list(barcode_reads_useful.keys())} + + barcodes_reads_cal = pd.DataFrame.from_dict(barcodes_reads_count, orient='index',columns=['counts']) + barcodes_reads_cal = barcodes_reads_cal.reset_index().rename(columns={'index': 'barcode'}) + barcodes_reads_cal = barcodes_reads_cal.sort_values(by='counts', ascending=False) + + i = 1 + for barcode in list(barcode_reads_useful.keys()): + + with open(f'{fq_outdir}/{i}.fq', 'w') as f: + for entry in barcode_reads_useful[barcode]: + f.write(str(entry) + '\n') + if i % 100 == 0: + get_fastq_to_assemble.logger.info(f'processed {i} cells') + i += 1 + #stat file + barcodes_reads_cal.to_csv(f'{fq_outdir}/reads_count.tsv', sep='\t') + + stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) + with open(f'{fq_outdir}/stat.txt', 'w') as s: + s.write(stat_string) + + +def split_fastq(args): + mode = args.mode + match_dir = 
args.match_dir + sample = args.sample + outdir = args.outdir + assay = args.assay + fq = args.fq + + fq_outdir = f'{outdir}/fastq' + barcodes = annotation_barcodes(match_dir, mode) + + get_fastq_to_assemble(fq_outdir, fq, barcodes) + + +def get_opts_split_fastq(parser, sub_program): + if sub_program: + parser.add_argument('--sample',help='sample name', required=True) + parser.add_argument('--outdir', help='output dir', required=True) + parser.add_argument('--assay', help='assay', required=True) + parser.add_argument('--fq', required=True) + parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--match_dir', help='matched rna_dir') + +
-- Gitee
From 47ad52acf58f732d4db731a6ebd5eef7d0cf009c Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 27 May 2021 15:43:11 +0800
Subject: [PATCH 17/96] add tracer_vdj
--- celescope/__init__.py | 1 + 1 file changed, 1 insertion(+)
diff --git a/celescope/__init__.py b/celescope/__init__.py index e87176ac..015f41d1 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py
@@ -14,4 +14,5 @@ ASSAY_DICT = { 'tag': 'Single Cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', + 'tracer_vdj': 'Single Cell Full Length TCR or BCR' }
-- Gitee
From ef334a0d8e59a1409baf73113775aced62679d6b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 28 May 2021 13:11:36 +0800
Subject: [PATCH 18/96] add tracer_vdj and fix bug
--- celescope/templates/html/tracer_vdj/base.html | 161 ++++++++++++++++++ celescope/tools/barcode.py | 14 +- celescope/tracer_vdj/go_assemble.py | 28 ++- celescope/tracer_vdj/multi_tracer_vdj.py | 35 +--- celescope/tracer_vdj/split_fastq.py | 58 ++++--- 5 files changed, 226 insertions(+), 70 deletions(-) create mode 100755 celescope/templates/html/tracer_vdj/base.html
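[Editor's note — not part of the patch series: the base.html template added below guards every report section with {% if ... is defined %}, so only the steps that supply data are rendered. A minimal sketch of rendering it, assuming Jinja2; the loader path and context are illustrative:]

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader("celescope/templates"))
    template = env.get_template("html/tracer_vdj/base.html")
    # sections whose variables are left undefined are skipped by the guards
    html = template.render(sample_summary={"Sample": "test1"})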
+<!-- navbar markup (lost in extraction); brand text: "CeleScope Report" -->
+    {% if sample_summary is defined %}
+        {% include "html/common/sample_summary.html"%}
+    {% endif %}
+
+    {% if barcode_summary is defined %}
+        {% include "html/common/barcode_summary.html"%}
+    {% endif %}
+
+    {% if cutadapt_summary is defined %}
+        {% include "html/common/cutadapt_summary.html"%}
+    {% endif %}
+
+    {% if consensus_summary is defined %}
+        {% include "html/common/consensus_summary.html"%}
+    {% endif %}
+
+    {% if split_fastq is defined %}
+        {% include "html/tracer_vdj/split_fastq_summary.html"%}
+    {% endif %}
+
+    {% if go_assemble_summary is defined %}
+        {% include "html/tracer_vdj/go_assemble_summary.html"%}
+    {% endif %}
+
+    {% if table_dict is defined %}
+        {% include "html/vdj/clonetypes_table.html"%}
+    {% endif %}
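Each summary card above is gated on whether its context variable was defined at render time, so one base template can serve every combination of pipeline steps. A minimal sketch of that behavior, assuming jinja2 as the template engine; the loader setup and context keys below are illustrative, not CeleScope's actual report code:

from jinja2 import Environment, DictLoader

# One-template environment; the real report loads base.html from the package.
env = Environment(loader=DictLoader({
    "base.html": (
        "{% if sample_summary is defined %}[sample card]{% endif %}"
        "{% if vdj_sum_summary is defined %}[cell card]{% endif %}"
    )
}))

# Only keys actually passed to render() make their card appear.
html = env.get_template("base.html").render(sample_summary=[("Samples", "1")])
print(html)  # -> "[sample card]"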
+ + + + \ No newline at end of file diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index 3974a4de..1c48f6e3 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -225,12 +225,17 @@ class Barcode(Step): self.lowNum = args.lowNum self.lowQual = args.lowQual self.allowNoPolyT = args.allowNoPolyT - self.allowNoLinker = args.allowNoLinker + self.allowNoLinker = args.allowNoLinker + self.paired_fq = args.paired_fq + self.new_f1 = f'{self.outdir}/{self.sample}_new_R1.fq{suffix}' + self.new_f2 = f'{self.outdir}/{self.sample}_new_R2.fq{suffix}' @utils.add_log def run(self): fh3 = xopen(self.out_fq2, 'w') + new_f1 = xopen(self.new_f1, 'w') + new_f2 = xopen(self.new_f2, 'w') if self.nopolyT: fh1_without_polyT = xopen(self.outdir + '/noPolyT_1.fq', 'w') @@ -377,6 +382,12 @@ class Barcode(Step): self.umi_qual_Counter.update(C_U_quals_ascii[C_len:]) fh3.write(f'@{cb}_{umi}_{self.total_num}\n{seq2}\n+\n{qual2}\n') + + if self.paired_fq: + + new_f1.write(f'@{header1}\n{cb}{umi}\n+\n{C_U_quals_ascii}\n') + new_f2.write(f'@{header2}\n{seq2}\n+\n{qual2}\n') + Barcode.run.logger.info(self.fq1_list[i] + ' finished.') fh3.close() @@ -487,6 +498,7 @@ def get_opts_barcode(parser, sub_program=True): parser.add_argument('--gzip', help="output gzipped fastq", action='store_true') parser.add_argument( '--chemistry', choices=__PATTERN_DICT__.keys(), help='chemistry version', default='auto') + parser.add_argument('--paired_fq', help="output R1 R2", action='store_true') if sub_program: parser.add_argument('--fq1', help='read1 fq file', required=True) parser.add_argument('--fq2', help='read2 fq file', required=True) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 65f4696a..af5c622d 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -3,7 +3,8 @@ import os from os import listdir from os.path import isfile, join from concurrent.futures import ProcessPoolExecutor -from celescope.tools.utils import add_log +from celescope.tools import utils +from celescope.tools.utils import * import datetime @@ -16,7 +17,7 @@ BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' # 开始组装 - +@utils.add_log def bracer_summarise(outdir): bracer_outdir = f'{outdir}/bracer' cmd = ( @@ -29,7 +30,7 @@ def bracer_summarise(outdir): bracer_summarise.logger.info(cmd) os.system(cmd) - +@utils.add_log def bracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( @@ -48,7 +49,7 @@ def bracer(fq, outdir, species): bracer.logger.info(cmd) os.system(cmd) - +@utils.add_log def tracer_summarise(outdir): tracer_outdir = f'{outdir}/tracer' cmd = ( @@ -61,7 +62,7 @@ def tracer_summarise(outdir): tracer_summarise.logger.info(cmd) os.system(cmd) - +@utils.add_log def tracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( @@ -82,7 +83,7 @@ def tracer(fq, outdir, species): os.system(cmd) -@add_log +@utils.add_log def run_tracer(outdir, fastq_dir, species, thread): fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] @@ -99,7 +100,7 @@ def run_tracer(outdir, fastq_dir, species, thread): tracer_summarise(outdir) -@add_log +@utils.add_log def run_bracer(outdir, fastq_dir, species, thread): fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] outdirs = [outdir] * len(fqs) @@ -121,20 +122,17 @@ def go_assemble(args): outdir = args.outdir species = args.species - mode = args.mode - if mode == 'TCR': + type = args.type + if 
type == 'TCR': run_tracer(outdir, fastq_dir, species, thread) - elif mode == 'BCR': + elif type == 'BCR': run_bracer(outdir, fastq_dir, species, thread) def get_opts_go_assemble(parser, sub_program): if sub_program: - parser.add_argument("--outdir", help="assemble outdir", required=True) - parser.add_argument("--sample", help="vdj sample name", required=True) - parser.add_argument('--assay', help='assay', required=True) + parser = s_common(parser) parser.add_argument('--fastq_dir', required=True) - parser.add_argument('--mode', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--type', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) - parser.add_argument('--thread', help='thread', default=20) diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 97beed69..8efb09f4 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -3,49 +3,28 @@ from celescope.tools.Multi import Multi class Multi_tracer_vdj(Multi): - def custome_args(self): - self.parser.add_argument('--thread', help='thread', default=20) - self.parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR']) - self.parser.add_argument('--species', help='species name', choices=['Hsap', 'Mmus']) - - def read_custome_args(self): - self.thread = self.args.thread - self.mode = self.args.mode - self.species = self.args.species def split_fastq(self, sample): step = 'split_fastq' + cmd_line = self.get_cmd_line(step, sample) fq = f'{self.outdir_dic[sample]["cutadapt"]}/{sample}_clean_2.fq{self.fq_suffix}' cmd = ( - f'{self.__APP__} ' - f'{self.__ASSAY__} ' - f'{step} ' - f'--outdir {self.outdir_dic[sample][step]} ' - f'--sample {sample} ' - f'--assay {self.__ASSAY__} ' + f'{cmd_line} ' f'--fq {fq} ' - f'--mode {self.mode} ' - f'--match_dir {self.col4_dict[sample]} ' + f'--match_dir {self.col4_dict[sample]}' ) self.process_cmd(cmd, step, sample, m=5, x=1) def go_assemble(self, sample): step = 'go_assemble' - fastq_dir = f'{self.outdir_dic[sample]["split_fq"]}/fastq' + cmd_line = self.get_cmd_line(step, sample) + fastq_dir = f'{self.outdir_dic[sample]["split_fastq"]}/fastq' cmd = ( - f'{self.__APP__} ' - f'{self.__ASSAY__} ' - f'{step} ' - f'--outdir {self.outdir_dic[sample][step]} ' - f'--sample {sample} ' - f'--assay {self.__ASSAY__} ' + f'{cmd_line} ' f'--fastq_dir {fastq_dir} ' - f'--mode {self.mode} ' - f'--species {self.species} ' - f'--thread {self.thread} ' ) - self.process_cmd(cmd, step, sample, m=1.5 * int(self.args.thread), x=self.args.thread) + self.process_cmd(cmd, step, sample, m=30, x=self.args.thread) def main(): diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 70ada28b..9b14517b 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -5,24 +5,26 @@ import argparse import datetime import pandas as pd from Bio.Seq import Seq -from glob import glob -from celescope.tools.utils import add_log +import glob +from celescope.tools import utils +from celescope.tools.utils import * -@add_log -def annotation_barcodes(match_dir, mode): +@utils.add_log +def annotation_barcodes(match_dir, type): - cluster_data = glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv')[0] - + cluster_data = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv') + cluster_data = cluster_data[0] 
cluster_type = pd.read_csv(cluster_data, sep='\t') # filter barcodes - if mode == 'TCR': + if type == 'TCR': clusters = list(cluster_type[cluster_type['cell_type'] == 'T cells']['cluster']) - elif mode == 'BCR': + elif type == 'BCR': clusters = list(cluster_type[cluster_type['cell_type'] == 'B cells']['cluster']) - tsne = glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv')[0] + tsne = glob.glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv') + tsne = tsne[0] tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) barcodes = [] @@ -30,8 +32,8 @@ def annotation_barcodes(match_dir, mode): tmp = tsne_coord[tsne_coord['cluster'] == cluster].index.tolist() barcodes += tmp # write barcodes - barcodes_path = glob(f'{match_dir}/06.analysis/*_auto_assign/')[0] - + barcodes_path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') + barcodes_path = barcodes_path[0] with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: for barcode in barcodes: barcode = Seq(barcode) @@ -44,7 +46,7 @@ def annotation_barcodes(match_dir, mode): return res -@add_log +@utils.add_log def get_fastq_to_assemble(fq_outdir, fq, barcodes): """ split_fastq @@ -82,25 +84,30 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): barcodes_reads_cal = barcodes_reads_cal.reset_index().rename(columns={'index': 'barcode'}) barcodes_reads_cal = barcodes_reads_cal.sort_values(by='counts', ascending=False) + barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') + + stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) + with open(f'{fq_outdir}/../stat.txt', 'w') as s: + s.write(stat_string) + i = 1 for barcode in list(barcode_reads_useful.keys()): with open(f'{fq_outdir}/{i}.fq', 'w') as f: for entry in barcode_reads_useful[barcode]: f.write(str(entry) + '\n') - if i % 100 == 0: + if i % 1000 == 0: get_fastq_to_assemble.logger.info(f'processed {i} cells') - i += 1 - #stat file - barcodes_reads_cal.to_csv(f'{fq_outdir}/reads_count.tsv', sep='\t') - stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) - with open(f'{fq_outdir}/stat.txt', 'w') as s: - s.write(stat_string) + if i == len(list(barcode_reads_useful.keys())): + get_fastq_to_assemble.loogger.info(f'finnaly get {i} cells') + + i += 1 + def split_fastq(args): - mode = args.mode + type = args.type match_dir = args.match_dir sample = args.sample outdir = args.outdir @@ -108,18 +115,17 @@ def split_fastq(args): fq = args.fq fq_outdir = f'{outdir}/fastq' - barcodes = annotation_barcodes(match_dir, mode) + barcodes = annotation_barcodes(match_dir, type) get_fastq_to_assemble(fq_outdir, fq, barcodes) def get_opts_split_fastq(parser, sub_program): if sub_program: - parser.add_argument('--sample',help='sample name', required=True) - parser.add_argument('--outdir', help='output dir', required=True) - parser.add_argument('--assay', help='assay', required=True) + parser = s_common(parser) parser.add_argument('--fq', required=True) - parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) - parser.add_argument('--match_dir', help='matched rna_dir') + parser.add_argument('--match_dir', help='matched rna_dir') + parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + -- Gitee From ecfbaa7de7a73b09b82e6c3eafee977a8973f4e8 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 28 May 2021 19:20:27 +0800 Subject: [PATCH 19/96] add vdj_sum to filter results and summarise --- celescope/tracer_vdj/__init__.py | 3 +- 
celescope/tracer_vdj/go_assemble.py | 8 +- celescope/tracer_vdj/multi_tracer_vdj.py | 9 + celescope/tracer_vdj/split_fastq.py | 2 +- celescope/tracer_vdj/vdj_sum.py | 214 +++++++++++++++++++++++ 5 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 celescope/tracer_vdj/vdj_sum.py diff --git a/celescope/tracer_vdj/__init__.py b/celescope/tracer_vdj/__init__.py index eb48cf9f..cdeab6e0 100644 --- a/celescope/tracer_vdj/__init__.py +++ b/celescope/tracer_vdj/__init__.py @@ -3,5 +3,6 @@ __STEPS__ = [ 'barcode', 'cutadapt', 'split_fastq', - 'go_assemble'] + 'go_assemble', + 'vdj_sum'] __ASSAY__ = 'tracer_vdj' diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index af5c622d..6f276d23 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -17,7 +17,7 @@ BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' # 开始组装 -@utils.add_log + def bracer_summarise(outdir): bracer_outdir = f'{outdir}/bracer' cmd = ( @@ -30,7 +30,7 @@ def bracer_summarise(outdir): bracer_summarise.logger.info(cmd) os.system(cmd) -@utils.add_log + def bracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( @@ -49,7 +49,7 @@ def bracer(fq, outdir, species): bracer.logger.info(cmd) os.system(cmd) -@utils.add_log + def tracer_summarise(outdir): tracer_outdir = f'{outdir}/tracer' cmd = ( @@ -62,7 +62,7 @@ def tracer_summarise(outdir): tracer_summarise.logger.info(cmd) os.system(cmd) -@utils.add_log + def tracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 8efb09f4..173b1e52 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -26,6 +26,15 @@ class Multi_tracer_vdj(Multi): ) self.process_cmd(cmd, step, sample, m=30, x=self.args.thread) + def vdj_sum(self, sample): + step = 'vdj_sum' + cmd_line = self.get_cmd_line(step, sample) + ass_dir = f'{self.outdir_dic[sample]["go_assemble"]}' + cmd = ( + f'{cmd_line} ' + f'--ass_dir {ass_dir} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=2) def main(): multi = Multi_tracer_vdj(__ASSAY__) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 9b14517b..024c295a 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -86,7 +86,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') - stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) + stat_string = 'All cells:{}\nmatched cell:{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) with open(f'{fq_outdir}/../stat.txt', 'w') as s: s.write(stat_string) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py new file mode 100644 index 00000000..162e5f0c --- /dev/null +++ b/celescope/tracer_vdj/vdj_sum.py @@ -0,0 +1,214 @@ +import pysam +from collections import defaultdict +import os +import argparse +import datetime +import pandas as pd +from Bio.Seq import Seq +import glob +from celescope.tools import utils +from celescope.tools.utils import * + + +def tpm_count(ass_dir): + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recom' # ass_dir outdir/sample/04.go_assemble + f'binants.txt', sep='\t') + productive = rec[rec['productive'] == True] + productive['TPM'] = '' + indx = list(productive.index) + for i in 
indx: + cell_name = productive.at[i, 'cell_name'] + rec_id = productive.at[i, 'recombinant_id'] + with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: + for line in tsvf: + if rec_id in line: + line = line.rstrip() + line = line.split('\t') + tpm = float(line[4]) + productive.loc[i, 'TPM'] = tpm + + return productive + + +def filtering(type, ass_dir, sum_dir): + + if not os.path.exists(sum_dir): + os.makedirs(sum_dir) + + if type == 'TCR': + data = tpm_count(ass_dir) + cell_name = set(list(data['cell_name'])) + filtered = pd.DataFrame() + for name in cell_name: + count_data = data[data['cell_name'] == name] + tra = count_data[count_data['locus'] == 'A'] + trb = count_data[count_data['locus'] == 'B'] + if tra.empty is not True: + tra = tra.sort_values(by='TPM', ascending=False) + tra = tra.head(1) + filtered = filtered.append(tra, ignore_index=True) + if trb.empty is not True: + trb = trb.sort_values(by='TPM', ascending=False) + trb = trb.head(1) + filtered = filtered.append(trb, ignore_index=True) + filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + + elif type == 'BCR': + + data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') + data = data[data['FUNCTIONAL'] == True] + cell_name = set(list(data['CELL'])) + filtered = pd.DataFrame() + for name in cell_name: + count_cell = data[data['CELL'] == name] + count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) + count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) + count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) + count_k_l = count_k.append(count_l) + if count_h.empty is not True: + count_h = count_h.sort_values(by='TPM', ascending=False) + count_h = count_h.head(1) + filtered = filtered.append(count_h, ignore_index=True) + if count_k_l.empty is not True: + count_k_l = count_k_l.sort_values(by='TPM', ascending=False) + count_k_l = count_k_l.head(1) + filtered = filtered.append(count_k_l, ignore_index=True) + + filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + + return filtered + + +def res_sum(type, ass_dir, sum_dir): + filtered = filtering(type, ass_dir, sum_dir) + + if type == 'TCR': + count_a = filtered[filtered['locus'] == 'A'].shape[0] + count_b = filtered[filtered['locus'] == 'B'].shape[0] + paired_cell = pd.DataFrame(filtered['cell_name'].value_counts()) + productive_cells = paired_cell.shape[0] + unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] + paired_cell = paired_cell[paired_cell['cell_name'] == 2] + paired_cell = list(paired_cell.index) + string1 = f'productive TRA:\t{count_a}/{productive_cells}\nproductive TRB:\t{count_b}/{productive_cells}\npaired TRA and TRB:\t{len(paired_cell)}/{productive_cells}\n' + + with open(f'{sum_dir}/stat.txt', 'w') as fh: + fh.write(string1) + + aaseqs = [] + for cell in paired_cell: + temp = filtered[filtered['cell_name'] == cell] + temp_loci = list(temp['locus']) + temp_aaseq = list(temp['CDR3aa']) + string = 'TR{}:C{}F;TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0], temp_loci[1], temp_aaseq[1]) + aaseqs.append(string) + + for cell in list(unpaired_cell.index): + temp = filtered[filtered['cell_name'] == cell] + temp_loci = list(temp['locus']) + temp_aaseq = list(temp['CDR3aa']) + string = 'TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0]) + aaseqs.append(string) + + per_count_data = pd.DataFrame() + per_count_data['cdr3s_aa'] = aaseqs + clone_count = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) + clone_count.columns = ["frequency"] + proportation = [] + sum = 
clone_count['frequency'].sum() + for f in list(clone_count['frequency']): + p = f/sum + proportation.append(p) + clone_count['proportation'] = proportation + clone_count = clone_count.reset_index() + clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + + elif type == 'BCR': + filtered_h = filtered[filtered['LOCUS'] == 'H'] + filtered_k = filtered[filtered['LOCUS'] == 'K'] + filtered_l = filtered[filtered['LOCUS'] == 'L'] + filtered_h_count = filtered_h.shape[0] + filtered_k_count = filtered_k.shape[0] + filtered_l_count = filtered_l.shape[0] + + paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) + productive_cells = paired_cell.shape[0] + + paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) + productive_cells = paired_cell.shape[0] + unpaired_cell = paired_cell[paired_cell['CELL'] == 1] + paired_cell = paired_cell[paired_cell['CELL'] == 2] + paired_k = 0 + paired_l = 0 + + clones = pd.DataFrame() + cells = list(paired_cell.index) + aaseqs = [] + + for cell in cells: + if 'K' in list(filtered[filtered['CELL'] == cell]['LOCUS']): + paired_k += 1 + elif 'L' in list(filtered[filtered['CELL'] == cell]['LOCUS']): + paired_l += 1 + tep = filtered[filtered['CELL'] == cell] + tep_loci = list(tep['LOCUS']) + cdr3 = list(tep['JUNCTION']) + aaseq = [] + for seq in cdr3: + seq = Seq(seq) + seq = seq.translate() + aaseq.append(seq) + string = 'IG{}:{};IG{}:{}'.format(tep_loci[0], aaseq[0], tep_loci[1], aaseq[1]) + aaseqs.append(string) + + for cell in list(unpaired_cell.index): + cells.append(cell) + locus = list(filtered[filtered['CELL'] == cell]['LOCUS']) + cdr3 = list(filtered[filtered['CELL'] == cell]['JUNCTION']) + seq = Seq(cdr3[0]) + seq = seq.translate() + string = 'IG{}:{}'.format(locus[0], seq) + aaseqs.append(string) + + clones['CELLS'] = cells + + clones["cdr3s_aa"] = aaseqs + clone_count = pd.DataFrame(clones['cdr3s_aa'].value_counts()) + clone_count.columns = ["frequency"] + proportation = [] + sum = clone_count['frequency'].sum() + for f in list(clone_count['frequency']): + p = f/sum + proportation.append(p) + clone_count['proportation'] = proportation + clone_count = clone_count.reset_index() + clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + + stat_string_1 = f"BCR_H reconstruction:\t{filtered_h_count}/{productive_cells}\nBCR_K reconstruction:\t{filtered_k_count}/{productive_cells}\nBCR_L reconstruction:\t{filtered_l_count}/{productive_cells}\n" + + stat_string_2 = "Paired HK productive reconstruction:\t{}/{}\nPaired HL productive reconstruction:\t{}/{}\n".format(paired_k, productive_cells, paired_l, productive_cells) + + with open(f'{sum_dir}/stat.txt', 'w') as s: + s.write(stat_string_1) + s.write(stat_string_2) + +@utils.add_log +def vdj_sum(args): + type = args.type + ass_dir = args.ass_dir + sample = args.sample + outdir = args.outdir + + res_sum(type, ass_dir, outdir) + + +def get_opts_vdj_sum(parser, sub_program): + if sub_program: + parser = s_common(parser) + parser.add_argument('--ass_dir', help='assemble dir', required=True) + parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + + + -- Gitee From f5ce79bbdbffb567e23eada0ea7e5bb666f9eb98 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 1 Jun 2021 14:40:37 +0800 Subject: [PATCH 20/96] updata stat --- celescope/tracer_vdj/split_fastq.py | 12 ++++++++++-- celescope/tracer_vdj/vdj_sum.py | 2 +- 2 files changed, 11 
insertions(+), 3 deletions(-) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 024c295a..0c5a0223 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -55,17 +55,25 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): os.makedirs(fq_outdir) barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads + umi_count = defaultdict() reads_count_dict = {} # all barcodes and reads num for each barcode all_barcodes = [] # all barcodes with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') barcode = attr[0] + umi = attr[1] + umi_count[barcode][umi] += 1 all_barcodes.append(barcode) barcode_reads_dict[barcode].append(entry) for barcode in list(barcode_reads_dict.keys()): reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) + + umi_count_df = pd.DataFrame([(k, list(v.keys())[0], list(v.values())[0]) for k, v in umi_count.items()], columns=['Barcode', 'umi', 'umi_reads_count']) + + umi_df = umi_count_df.groupby(['Barcode']).agg({'UMI': 'count'}) + umi_df.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') barcodes_for_match = [] for barcode in barcodes: @@ -86,7 +94,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') - stat_string = 'All cells:{}\nmatched cell:{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) + stat_string = 'All_cells:\t{}\nmatched_cell:\t{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) with open(f'{fq_outdir}/../stat.txt', 'w') as s: s.write(stat_string) @@ -100,7 +108,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): get_fastq_to_assemble.logger.info(f'processed {i} cells') if i == len(list(barcode_reads_useful.keys())): - get_fastq_to_assemble.loogger.info(f'finnaly get {i} cells') + get_fastq_to_assemble.logger.info(f'finnaly get {i} cells') i += 1 diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 162e5f0c..f2020e0a 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -90,7 +90,7 @@ def res_sum(type, ass_dir, sum_dir): unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] paired_cell = paired_cell[paired_cell['cell_name'] == 2] paired_cell = list(paired_cell.index) - string1 = f'productive TRA:\t{count_a}/{productive_cells}\nproductive TRB:\t{count_b}/{productive_cells}\npaired TRA and TRB:\t{len(paired_cell)}/{productive_cells}\n' + string1 = f'productive_TRA:\t{count_a}/{productive_cells}\nproductive_TRB:\t{count_b}/{productive_cells}\npaired_TRA_and_TRB:\t{len(paired_cell)}/{productive_cells}\n' with open(f'{sum_dir}/stat.txt', 'w') as fh: fh.write(string1) -- Gitee From 3119aec0399e8f90f008b45044e98dc35cf7b53a Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Wed, 2 Jun 2021 19:14:37 +0800 Subject: [PATCH 21/96] update stat.txt and rewrite split_fq --- celescope/tracer_vdj/go_assemble.py | 112 ++++++++++++++++++++++- celescope/tracer_vdj/multi_tracer_vdj.py | 4 + celescope/tracer_vdj/split_fastq.py | 67 ++++++-------- celescope/tracer_vdj/vdj_sum.py | 68 +++++++++----- 4 files changed, 188 insertions(+), 63 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 6f276d23..fa4b70fd 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -6,6 +6,8 @@ from concurrent.futures import ProcessPoolExecutor from celescope.tools import utils from celescope.tools.utils import * import 
datetime +import glob +import pysam TRACER_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer' @@ -27,7 +29,6 @@ def bracer_summarise(outdir): f'--no_networks ' f'{bracer_outdir} ' ) - bracer_summarise.logger.info(cmd) os.system(cmd) @@ -40,13 +41,13 @@ def bracer(fq, outdir, species): f'--fragment_sd 5 ' f'--single_end ' f'--small_index ' + f'--no_trimming ' f'--species {species} ' f'-c {BRACER_CONF} ' f'{prefix} ' f'{outdir}/bracer ' f'{fq} ' ) - bracer.logger.info(cmd) os.system(cmd) @@ -59,7 +60,6 @@ def tracer_summarise(outdir): f'--no_networks ' f'{tracer_outdir} ' ) - tracer_summarise.logger.info(cmd) os.system(cmd) @@ -79,7 +79,6 @@ def tracer(fq, outdir, species): f'{prefix} ' f'{outdir}/tracer ' ) - tracer.logger.info(cmd) os.system(cmd) @@ -116,6 +115,111 @@ def run_bracer(outdir, fastq_dir, species, thread): bracer_summarise(outdir) +################def get_reads_count(fq): +# with pysam.FastxFile(fq) as fh: +# count = 0 +# for entry in fh: +# count += 1 +# return count + + +def get_umi_count(fq): + umis = [] + with pysam.FastxFile(fq) as fh: + for entry in fh: + attr = entry.name.split('_') + barcode = attr[0] + umi = attr[1] + umis.append(umi) + res = len(set(umis)) + return res + + +def get_assemble_stat(outdir, type): + + total_fq = f'{outdir}/../03.split_fastq/reads_count.tsv' + UMIs = pd.DataFrame(total_fq, sep='\t') + + all_UMIs = UMIs['UMIs_count'].sum() + stat_file = outdir + '/../04.go_assemble/stat.txt' + + if type == 'TCR': + TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] + TRA_UMIs_count = sum(TRA_UMIs) + TRA_ = format(TRA_UMIs_count, ',') + TRB_UMIs_count = sum(TRB_UMIs) + TRB_ = format(TRB_UMIs_count, ',') + + TRA_mapping = TRA_UMIs_count/all_UMIs + TRA_mapping = round(TRA_mapping, 4) + TRA_mapping = f'{TRA_}({TRA_mapping})' + + TRB_mapping = TRB_UMIs_count/all_UMIs + TRB_mapping = round(TRB_mapping, 4) + TRB_mapping = f'{TRB_}({TRB_mapping})' + + total_counts = TRA_UMIs_count + TRB_UMIs_count + total_ = format(total_counts, ',') + total_mapping = (total_counts)/all_UMIs + total_mapping = round(total_mapping, 4) + total_mapping = f'{total_}({total_mapping})' + + stat_text = pd.DataFrame({ + 'item': ['UMIs mapped to TRA or TRB', 'UMIs mapped to TRA', 'UMIs mapped to TRB'], 'count': [total_mapping, TRA_mapping, TRB_mapping] + }, columns=['item', 'count']) + + stat_text.to_csv(stat_file, sep=':', header=None, index=False) + + elif type == 'BCR': + IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') + + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + + + IGH = sum(IGH_UMIs) + IGH_ = format(IGH, ',') + IGK = sum(IGK_UMIs) + IGK_ = format(IGK, ',') + IGL = sum(IGL_UMIs) + IGL_ = format(IGL, ',') + + IGH_mapping = IGH/all_UMIs + IGH_mapping = round(IGH_mapping, 4) + IGH_mapping = f'{IGH_}({IGH_mapping})' + + IGK_mapping = IGK/all_UMIs + IGK_mapping = round(IGK_mapping, 4) + IGK_mapping = f'{IGK_}({IGK_mapping})' + + IGL_mapping = IGL/all_UMIs + IGL_mapping = round(IGL_mapping, 4) + IGL_mapping = f'{IGL_}({IGL_mapping})' + + total_counts = IGH + IGK + IGL + total_ = format(total_counts, ',') + + total_mapping = (total_counts)/all_UMIs + 
total_mapping = round(total_mapping, 4) + total_mapping = f'{total_}({total_mapping})' + + stat_text = pd.DataFrame({ + 'item': ['UMIs mapped to IGH, IGK or IGL', 'UMIs mapped to IGH', 'UMIs mapped to IGK', 'UMIs mapped to IGL'], 'count': [total_mapping, IGH_mapping, IGK_mapping, IGL_mapping] + }) + + stat_text.to_csv(stat_file, sep=':', header=None, index=False) + + + + + + def go_assemble(args): thread = int(args.thread) fastq_dir = args.fastq_dir diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 173b1e52..18ee9f98 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -30,9 +30,13 @@ class Multi_tracer_vdj(Multi): step = 'vdj_sum' cmd_line = self.get_cmd_line(step, sample) ass_dir = f'{self.outdir_dic[sample]["go_assemble"]}' + + fastq_dir = f'{self.outdir_dic[sample]["split_fastq"]}/fastq' + cmd = ( f'{cmd_line} ' f'--ass_dir {ass_dir} ' + f'--fastq_dir {fastq_dir} ' ) self.process_cmd(cmd, step, sample, m=5, x=2) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 0c5a0223..699be4c5 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -7,7 +7,7 @@ import pandas as pd from Bio.Seq import Seq import glob from celescope.tools import utils -from celescope.tools.utils import * +from celescope.tools.Step import Step, s_common @utils.add_log @@ -33,16 +33,17 @@ def annotation_barcodes(match_dir, type): barcodes += tmp # write barcodes barcodes_path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') - barcodes_path = barcodes_path[0] + barcodes_path = barcodes_path[0] + + res = [] with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: for barcode in barcodes: barcode = Seq(barcode) barcode_reversed = barcode.reverse_complement() bc = str(barcode_reversed) + res.append(bc) fh.write(bc + '\n') - with open(f'{barcodes_path}/reversed_barcodes.tsv') as res: - res = res.readlines() return res @@ -55,62 +56,54 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): os.makedirs(fq_outdir) barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads - umi_count = defaultdict() + # umi_count = defaultdict(list) reads_count_dict = {} # all barcodes and reads num for each barcode - all_barcodes = [] # all barcodes + umi_count_dict = defaultdict(list) + umi_count = {} with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') barcode = attr[0] umi = attr[1] - umi_count[barcode][umi] += 1 - all_barcodes.append(barcode) - barcode_reads_dict[barcode].append(entry) - for barcode in list(barcode_reads_dict.keys()): + if barcode in barcodes: + barcode_reads_dict[barcode].append(entry) + if umi_count_dict[barcode].count(umi) == 0: + umi_count_dict[barcode].append(umi) + for barcode in barcodes: reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - - umi_count_df = pd.DataFrame([(k, list(v.keys())[0], list(v.values())[0]) for k, v in umi_count.items()], columns=['Barcode', 'umi', 'umi_reads_count']) - umi_df = umi_count_df.groupby(['Barcode']).agg({'UMI': 'count'}) - - umi_df.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - - barcodes_for_match = [] - for barcode in barcodes: - barcode = barcode.strip('\n') - barcodes_for_match.append(barcode) - barcodes_to_use = list(set(barcodes_for_match).intersection(set(all_barcodes))) - # barcodes in both RNA data and BCR data + umi_count[barcode] = len(umi_count_dict[barcode]) - barcode_reads_useful = {barcode: 
barcode_reads_dict[barcode] for barcode in barcodes_to_use} + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMIs_count']) + df_umi = df_umi.reset_index().rename(columns={'index': 'barcode'}) + reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['reads_count']) + reads_count = reads_count.reset_index().rename(columns={'index': 'barcode'}) - barcodes_reads_count = {barcode: reads_count_dict[barcode] for barcode in - list(barcode_reads_useful.keys())} + df_f = pd.merge(reads_count, df_umi, on='barcode', how='inner') - barcodes_reads_cal = pd.DataFrame.from_dict(barcodes_reads_count, orient='index',columns=['counts']) - barcodes_reads_cal = barcodes_reads_cal.reset_index().rename(columns={'index': 'barcode'}) - barcodes_reads_cal = barcodes_reads_cal.sort_values(by='counts', ascending=False) + df_f = df_f.set_index('barcode') - barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') + i = 1 - stat_string = 'All_cells:\t{}\nmatched_cell:\t{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) - with open(f'{fq_outdir}/../stat.txt', 'w') as s: - s.write(stat_string) + for barcode in barcodes: - i = 1 - for barcode in list(barcode_reads_useful.keys()): + df_f.loc[barcode, 'cell_name'] = i with open(f'{fq_outdir}/{i}.fq', 'w') as f: - for entry in barcode_reads_useful[barcode]: + for entry in barcode_reads_dict[barcode]: f.write(str(entry) + '\n') + if i % 1000 == 0: get_fastq_to_assemble.logger.info(f'processed {i} cells') - if i == len(list(barcode_reads_useful.keys())): - get_fastq_to_assemble.logger.info(f'finnaly get {i} cells') + if i == len(barcodes): + get_fastq_to_assemble.logger.info(f'finally get {i} cells') i += 1 + + df_f = df_f.astype(int) + df_f.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index f2020e0a..803e97ab 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -7,7 +7,9 @@ import pandas as pd from Bio.Seq import Seq import glob from celescope.tools import utils -from celescope.tools.utils import * +from celescope.tools.Step import Step, s_common +import glob + def tpm_count(ass_dir): @@ -30,10 +32,10 @@ def tpm_count(ass_dir): return productive -def filtering(type, ass_dir, sum_dir): +def filtering(type, ass_dir, outdir): - if not os.path.exists(sum_dir): - os.makedirs(sum_dir) + if not os.path.exists(outdir): + os.makedirs(outdir) if type == 'TCR': data = tpm_count(ass_dir) @@ -51,7 +53,7 @@ def filtering(type, ass_dir, sum_dir): trb = trb.sort_values(by='TPM', ascending=False) trb = trb.head(1) filtered = filtered.append(trb, ignore_index=True) - filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') elif type == 'BCR': @@ -74,13 +76,13 @@ def filtering(type, ass_dir, sum_dir): count_k_l = count_k_l.head(1) filtered = filtered.append(count_k_l, ignore_index=True) - filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') return filtered -def res_sum(type, ass_dir, sum_dir): - filtered = filtering(type, ass_dir, sum_dir) +def res_sum(type, ass_dir, outdir): + filtered = filtering(type, ass_dir, outdir) if type == 'TCR': count_a = filtered[filtered['locus'] == 'A'].shape[0] @@ -90,10 +92,6 @@ def res_sum(type, ass_dir, sum_dir): unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] paired_cell = paired_cell[paired_cell['cell_name'] == 2] paired_cell = list(paired_cell.index) - string1 = 
f'productive_TRA:\t{count_a}/{productive_cells}\nproductive_TRB:\t{count_b}/{productive_cells}\npaired_TRA_and_TRB:\t{len(paired_cell)}/{productive_cells}\n' - - with open(f'{sum_dir}/stat.txt', 'w') as fh: - fh.write(string1) aaseqs = [] for cell in paired_cell: @@ -122,7 +120,9 @@ def res_sum(type, ass_dir, sum_dir): clone_count['proportation'] = proportation clone_count = clone_count.reset_index() clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') + + return productive_cells, count_a, count_b, paired_cell elif type == 'BCR': filtered_h = filtered[filtered['LOCUS'] == 'H'] @@ -184,15 +184,37 @@ def res_sum(type, ass_dir, sum_dir): clone_count['proportation'] = proportation clone_count = clone_count.reset_index() clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') - stat_string_1 = f"BCR_H reconstruction:\t{filtered_h_count}/{productive_cells}\nBCR_K reconstruction:\t{filtered_k_count}/{productive_cells}\nBCR_L reconstruction:\t{filtered_l_count}/{productive_cells}\n" - - stat_string_2 = "Paired HK productive reconstruction:\t{}/{}\nPaired HL productive reconstruction:\t{}/{}\n".format(paired_k, productive_cells, paired_l, productive_cells) + return productive_cells, filtered_h_count, filtered_k_count, filtered_l_count, paired_k, paired_l + + + +def get_stat(fastq_dir, ass_dir, outdir, type): + fqs = glob.glob(f'{fastq_dir}/*.fq') + matched_bcs = len(fqs) + + stat_file = outdir + '/stat.txt' + if type == 'TCR': + productive_cells, TRA_num, TRB_num, paired_num = res_sum(type, ass_dir, outdir) + + stat_text = pd.DataFrame({ + 'item': ['Matched cells', 'Productive cells', 'Cells with TRA', 'Cells with TRB', 'Cells with paired TRA and TRB'], + 'count': [matched_bcs, productive_cells, TRA_num, TRB_num, paired_num] + }, + columns=['item', 'count']) + stat_text.to_csv(stat_file, sep=':', header=None, index=False) + + elif type == 'BCR': + productive_cells, H_num, K_num, L_num, H_K_num, H_L_num = res_sum(type, ass_dir,outdir) + + stat_text = pd.DataFrame({ + 'item': ['Matched cells', 'Productive cells', 'Cells with IGH', 'Cells with IGK', 'Cells with IGL', 'Cells with IGH and IGK', 'Cells with IGH and IGL'], + 'count': [matched_bcs, productive_cells, H_num, K_num, L_num, H_K_num, H_L_num] + }, + columns=['item', 'count']) + stat_text.to_csv(stat_file, sep=":", header=None, index=False) - with open(f'{sum_dir}/stat.txt', 'w') as s: - s.write(stat_string_1) - s.write(stat_string_2) @utils.add_log def vdj_sum(args): @@ -200,14 +222,16 @@ def vdj_sum(args): ass_dir = args.ass_dir sample = args.sample outdir = args.outdir - - res_sum(type, ass_dir, outdir) + fastq_dir = args.fastq_dir + + get_stat(fastq_dir, ass_dir, outdir, type) def get_opts_vdj_sum(parser, sub_program): if sub_program: parser = s_common(parser) parser.add_argument('--ass_dir', help='assemble dir', required=True) + parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) -- Gitee From ade04c76c41c86a99e9efd3caca3f680ab07d88e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 3 Jun 2021 16:59:38 +0800 Subject: [PATCH 22/96] generate stat --- celescope/tracer_vdj/go_assemble.py | 104 +++++----- celescope/tracer_vdj/vdj_sum.py | 288 ++++++++++++++++++++++------ 2 
files changed, 290 insertions(+), 102 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index fa4b70fd..cc8912a3 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -8,6 +8,8 @@ from celescope.tools.utils import * import datetime import glob import pysam +import numpy as np +from celescope.tools.Step import Step, s_common TRACER_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer' @@ -135,43 +137,46 @@ def get_umi_count(fq): return res -def get_assemble_stat(outdir, type): +def go_assemble_summary(outdir, type): total_fq = f'{outdir}/../03.split_fastq/reads_count.tsv' - UMIs = pd.DataFrame(total_fq, sep='\t') + UMIs = pd.read_csv(total_fq, sep='\t') - all_UMIs = UMIs['UMIs_count'].sum() + all_UMIs = UMIs['UMIs_count'].tolist() + medians = int(np.median(all_UMIs)) + all_UMIs = sum(all_UMIs) + stat_file = outdir + '/../04.go_assemble/stat.txt' + go_assemble_summary = [] + if type == 'TCR': TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') TRA_UMIs = [get_umi_count(fq) for fq in TRAs] TRB_UMIs = [get_umi_count(fq) for fq in TRBs] TRA_UMIs_count = sum(TRA_UMIs) - TRA_ = format(TRA_UMIs_count, ',') + medianA = int(np.median(TRA_UMIs)) TRB_UMIs_count = sum(TRB_UMIs) - TRB_ = format(TRB_UMIs_count, ',') - - TRA_mapping = TRA_UMIs_count/all_UMIs - TRA_mapping = round(TRA_mapping, 4) - TRA_mapping = f'{TRA_}({TRA_mapping})' - - TRB_mapping = TRB_UMIs_count/all_UMIs - TRB_mapping = round(TRB_mapping, 4) - TRB_mapping = f'{TRB_}({TRB_mapping})' - + medianB = int(np.median(TRB_UMIs)) total_counts = TRA_UMIs_count + TRB_UMIs_count - total_ = format(total_counts, ',') - total_mapping = (total_counts)/all_UMIs - total_mapping = round(total_mapping, 4) - total_mapping = f'{total_}({total_mapping})' - stat_text = pd.DataFrame({ - 'item': ['UMIs mapped to TRA or TRB', 'UMIs mapped to TRA', 'UMIs mapped to TRB'], 'count': [total_mapping, TRA_mapping, TRB_mapping] - }, columns=['item', 'count']) + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRA', + 'count': TRA_UMIs_count, + 'total_count': total_counts, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRB', + 'count': TRB_UMIs_count, + 'total_count': total_counts, + }) - stat_text.to_csv(stat_file, sep=':', header=None, index=False) + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Madian UMIs per cell:{medians}\n') + f.write(f'Median TRA UMIs per cell:{medianA}\n') + f.write(f'Median TRB UMIs per cell:{medianB}\n') elif type == 'BCR': IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') @@ -182,45 +187,47 @@ def get_assemble_stat(outdir, type): IGK_UMIs = [get_umi_count(fq) for fq in IGKs] IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - IGH = sum(IGH_UMIs) - IGH_ = format(IGH, ',') + medianH = np.median(IGH_UMIs) IGK = sum(IGK_UMIs) - IGK_ = format(IGK, ',') + medianK = np.median(IGK_UMIs) IGL = sum(IGL_UMIs) - IGL_ = format(IGL, ',') - - IGH_mapping = IGH/all_UMIs - IGH_mapping = round(IGH_mapping, 4) - IGH_mapping = f'{IGH_}({IGH_mapping})' - - IGK_mapping = IGK/all_UMIs - IGK_mapping = round(IGK_mapping, 4) - IGK_mapping = f'{IGK_}({IGK_mapping})' - - IGL_mapping = IGL/all_UMIs - IGL_mapping = round(IGL_mapping, 4) - IGL_mapping = f'{IGL_}({IGL_mapping})' + medianL = np.median(IGL_UMIs) total_counts = IGH + IGK + IGL - total_ = format(total_counts, ',') - - total_mapping = (total_counts)/all_UMIs - total_mapping = 
round(total_mapping, 4) - total_mapping = f'{total_}({total_mapping})' - stat_text = pd.DataFrame({ - 'item': ['UMIs mapped to IGH, IGK or IGL', 'UMIs mapped to IGH', 'UMIs mapped to IGK', 'UMIs mapped to IGL'], 'count': [total_mapping, IGH_mapping, IGK_mapping, IGL_mapping] + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGH', + 'count': IGH, + 'total_count': total_counts, }) - stat_text.to_csv(stat_file, sep=':', header=None, index=False) - + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGK', + 'count': IGK, + 'total_count': total_counts, + }) + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGL', + 'count': IGL, + 'total_count': total_counts, + }) + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median IGH UMIs per Cell:{medianH}\n') + f.write(f'Median IGK UMIs per Cell:{medianK}\n') + f.write(f'Median IGL UMIs per Cell:{medianL}\n') + + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) + utils.gen_stat(df, stat_file) def go_assemble(args): + step_name = 'go_assemble' + step = Step(args, step_name) thread = int(args.thread) fastq_dir = args.fastq_dir outdir = args.outdir @@ -232,6 +239,9 @@ def go_assemble(args): elif type == 'BCR': run_bracer(outdir, fastq_dir, species, thread) + go_assemble_summary(outdir, type) + + step.clean_up() def get_opts_go_assemble(parser, sub_program): if sub_program: diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 803e97ab..661e84df 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -6,15 +6,15 @@ import datetime import pandas as pd from Bio.Seq import Seq import glob +import re +import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common import glob - def tpm_count(ass_dir): - rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recom' # ass_dir outdir/sample/04.go_assemble - f'binants.txt', sep='\t') + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble productive = rec[rec['productive'] == True] productive['TPM'] = '' indx = list(productive.index) @@ -53,6 +53,7 @@ def filtering(type, ass_dir, outdir): trb = trb.sort_values(by='TPM', ascending=False) trb = trb.head(1) filtered = filtered.append(trb, ignore_index=True) + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') elif type == 'BCR': @@ -81,10 +82,43 @@ def filtering(type, ass_dir, outdir): return filtered -def res_sum(type, ass_dir, outdir): +@utils.add_log +def vdj_sum(args): + + step_name = f"vdj_sum" + step = Step(args, step_name) + + type = args.type + ass_dir = args.ass_dir + sample = args.sample + outdir = args.outdir + fastq_dir = args.fastq_dir + filtered = filtering(type, ass_dir, outdir) + fqs = glob.glob(f'{fastq_dir}/*.fq') + matched_bcs = len(fqs) + + stat_file = outdir + '/stat.txt' + + vdj_sum_summary = [] + if type == 'TCR': + + CB = filtered['cell_name'].tolist() + + df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') + + all_cells = df_umi['cell_name'].tolist() + + df_umi = df_umi.set_index('cell_name') + + for i in all_cells: + if i in CB: + df_umi.loc[i, 'mark'] = 'CB' + else: + df_umi.loc[i, 'mark'] = 'UB' + count_a = filtered[filtered['locus'] == 'A'].shape[0] count_b = filtered[filtered['locus'] == 'B'].shape[0] paired_cell = pd.DataFrame(filtered['cell_name'].value_counts()) @@ -110,21 +144,89 @@ def res_sum(type, ass_dir, outdir): 
per_count_data = pd.DataFrame() per_count_data['cdr3s_aa'] = aaseqs - clone_count = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) - clone_count.columns = ["frequency"] - proportation = [] - sum = clone_count['frequency'].sum() - for f in list(clone_count['frequency']): + clonetypes = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) + clonetypes.columns = ["Frequency"] + Percent = [] + sum = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): p = f/sum - proportation.append(p) - clone_count['proportation'] = proportation - clone_count = clone_count.reset_index() - clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') + Percent.append(p) + clonetypes['Percent'] = Percent + clonetypes = clonetypes.reset_index() + clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + vdj_sum_summary.append({ + 'item': 'Estimated Number of Cells', + 'count': matched_bcs, + 'total_count': matched_bcs, + }) + + vdj_sum_summary.append({ + 'item': 'Productive cells', + 'count': productive_cells, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRA', + 'count': count_a, + 'total_count': matched_bcs, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRB', + 'count': count_b, + 'total_count': matched_bcs, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired TRA and TRB', + 'count': len(paired_cell), + 'total_count': matched_bcs, + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians = [] + for line in f: + line = line.rstrip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) - return productive_cells, count_a, count_b, paired_cell elif type == 'BCR': + + CB = filtered['CELL'].tolist() + + df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') + + all_cells = df_umi['cell_name'].tolist() + + df_umi = df_umi.set_index('cell_name') + + for i in all_cells: + if i in CB: + df_umi.loc[i, 'mark'] = 'CB' + else: + df_umi.loc[i, 'mark'] = 'UB' + filtered_h = filtered[filtered['LOCUS'] == 'H'] filtered_k = filtered[filtered['LOCUS'] == 'K'] filtered_l = filtered[filtered['LOCUS'] == 'L'] @@ -174,57 +276,133 @@ def res_sum(type, ass_dir, outdir): clones['CELLS'] = cells clones["cdr3s_aa"] = aaseqs - clone_count = pd.DataFrame(clones['cdr3s_aa'].value_counts()) - clone_count.columns = ["frequency"] - proportation = [] - sum = clone_count['frequency'].sum() - for f in list(clone_count['frequency']): + clonetypes = pd.DataFrame(clones['cdr3s_aa'].value_counts()) + clonetypes.columns = ["Frequency"] + Percent = [] + sum = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): p = f/sum - proportation.append(p) - clone_count['proportation'] = proportation - clone_count = clone_count.reset_index() - clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') - - return productive_cells, filtered_h_count, filtered_k_count, filtered_l_count, paired_k, paired_l + Percent.append(p) + clonetypes['Percent'] = Percent + clonetypes = clonetypes.reset_index() + 
clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + + vdj_sum_summary.append({ + 'item': 'Matched cells', + 'count': matched_bcs, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Productive cells', + 'count': productive_cells, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH', + 'count': filtered_h_count, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGK', + 'count': filtered_k_count, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGL', + 'count': filtered_l_count, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH and IGK', + 'count': paired_k, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH and IGL', + 'count': paired_l, + 'total_count': matched_bcs + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians=[] + for line in f: + line = line.strip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medians[3], + 'total_count': np.nan + }) + + df = pd.DataFrame(vdj_sum_summary, + columns=['item', 'count', 'total_count']) + + df['count'] = df['count'].apply(int) + + df['percent'] = df['count']/(df.total_count.astype('float')) * 100 + df['percent'] = df['percent'].apply( + lambda x: round(x, 2) + ) + df['count'] = df['count'].apply(utils.format_number) -def get_stat(fastq_dir, ass_dir, outdir, type): - fqs = glob.glob(f'{fastq_dir}/*.fq') - matched_bcs = len(fqs) + def percent_str_func(row): + need_percent = bool( + re.search("Cells with", row["item"], flags=re.IGNORECASE)) + if need_percent: + return "(" + str(row["percent"]) + "%)" + else: + return "" - stat_file = outdir + '/stat.txt' - if type == 'TCR': - productive_cells, TRA_num, TRB_num, paired_num = res_sum(type, ass_dir, outdir) + df['percent_str'] = df.apply( + lambda row: percent_str_func(row), axis=1 + ) - stat_text = pd.DataFrame({ - 'item': ['Matched cells', 'Productive cells', 'Cells with TRA', 'Cells with TRB', 'Cells with paired TRA and TRB'], - 'count': [matched_bcs, productive_cells, TRA_num, TRB_num, paired_num] - }, - columns=['item', 'count']) - stat_text.to_csv(stat_file, sep=':', header=None, index=False) + def gen_stat(summary, stat_file): + stat = summary + stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] + stat = stat.loc[:, ["item", "new_count"]] + stat.to_csv(stat_file, sep=":", header=None, index=False) - elif type == 'BCR': - productive_cells, H_num, K_num, L_num, H_K_num, H_L_num = res_sum(type, ass_dir,outdir) + gen_stat(df, stat_file) - stat_text = pd.DataFrame({ - 'item': ['Matched cells', 'Productive cells', 'Cells with IGH', 'Cells with IGK', 'Cells with IGL', 'Cells with IGH and IGK', 'Cells with IGH and IGL'], - 'count': [matched_bcs, productive_cells, H_num, K_num, L_num, H_K_num, H_L_num] - }, - columns=['item', 'count']) - stat_text.to_csv(stat_file, sep=":", header=None, index=False) +# clonetype table + clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: 
str(x*100) + '%') + title = 'Clonetypes' + table_dict = step.get_table(title, 'clonetypes_table', clonetypes) -@utils.add_log -def vdj_sum(args): - type = args.type - ass_dir = args.ass_dir - sample = args.sample - outdir = args.outdir - fastq_dir = args.fastq_dir + step.add_data_item(table_dict=table_dict) - get_stat(fastq_dir, ass_dir, outdir, type) + step.clean_up() def get_opts_vdj_sum(parser, sub_program): -- Gitee From 32abee2adb3e8570b49c0ec3a9a5bf3154381a99 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 3 Jun 2021 18:19:42 +0800 Subject: [PATCH 23/96] add vdj_sum (clonetypes table and cell) to reports --- celescope/templates/html/tracer_vdj/base.html | 10 ++--- .../html/tracer_vdj/clonetypes_table.html | 37 +++++++++++++++++++ .../html/tracer_vdj/vdj_sum_summary.html | 29 +++++++++++++++ celescope/tracer_vdj/vdj_sum.py | 2 +- 4 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 celescope/templates/html/tracer_vdj/clonetypes_table.html create mode 100644 celescope/templates/html/tracer_vdj/vdj_sum_summary.html diff --git a/celescope/templates/html/tracer_vdj/base.html b/celescope/templates/html/tracer_vdj/base.html index 54ad567e..5318bb34 100755 --- a/celescope/templates/html/tracer_vdj/base.html +++ b/celescope/templates/html/tracer_vdj/base.html @@ -137,18 +137,18 @@ {% include "html/common/cutadapt_summary.html"%} {% endif %} - {% if consensus_summary is defined %} - {% include "html/common/consensus_summary.html"%} - {% endif %} - {% if split_fastq is defined %} {% include "html/tracer_vdj/split_fastq_summary.html"%} {% endif %} {% if go_assemble_summary is defined %} {% include "html/tracer_vdj/go_assemble_summary.html"%} - {% endif %} + {% endif %} + {% if vdj_sum_summary is defined %} + {% include "html/tracer_vdj/vdj_sum_summary.html"%} + {% endif %} + {% if table_dict is defined %} {% include "html/vdj/clonetypes_table.html"%} {% endif %} diff --git a/celescope/templates/html/tracer_vdj/clonetypes_table.html b/celescope/templates/html/tracer_vdj/clonetypes_table.html new file mode 100644 index 00000000..c4510563 --- /dev/null +++ b/celescope/templates/html/tracer_vdj/clonetypes_table.html @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + +
+<!-- collapsible table card markup (lost in extraction) -->
+    {{ table_dict['title'] }}
+    {{ table_dict['table'] | safe }}
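The card above expects table_dict['table'] to hold ready-made HTML, which is why it is piped through "| safe". A hedged sketch of how such a dict could be assembled from the clonetypes DataFrame; Step.get_table's real keys and markup are not shown in this patch, so the structure below is an assumption:

import pandas as pd

clonetypes = pd.DataFrame({
    "cdr3s_aa": ["TRA:CAVRDNYGQNFVF;TRB:CASSLGGEQYF"],
    "Frequency": [12],
    "Percent": ["60.0%"],
})

# Hypothetical equivalent of Step.get_table(title, table_id, df)
table_dict = {
    "title": "Clonetypes",
    "id": "clonetypes_table",
    "table": clonetypes.to_html(index=False, escape=True),  # pre-rendered <table>
}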
\ No newline at end of file diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html new file mode 100644 index 00000000..3c79d24f --- /dev/null +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -0,0 +1,29 @@ +
+<!-- "Cell" summary card markup (lost in extraction) -->
+    {% for item in vdj_sum_summary %}
+        {% for i in item %}
+            {{ i|e }}
+        {% endfor %}
+    {% endfor %}
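In the Cell card, every entry of vdj_sum_summary is itself iterated, so each metric must be an iterable whose elements become one cell each. Given how vdj_sum.py joins its count and percent_str columns, a plausible (but assumed) row shape is a (name, formatted count) pair:

# Assumed row shape for the template's nested loop; the exact structure the
# pipeline passes is not visible in this patch.
vdj_sum_summary = [
    ("Estimated Number of Cells", "1,034"),
    ("Productive cells", "980(94.78%)"),
    ("Cells with TRA", "901(87.14%)"),
]
for item in vdj_sum_summary:   # outer {% for item in vdj_sum_summary %}
    print("\t".join(item))     # inner loop renders each i, HTML-escaped by |e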
\ No newline at end of file diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 661e84df..aec8dde6 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -396,7 +396,7 @@ def vdj_sum(args): # clonetype table - clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: str(x*100) + '%') + clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: str(round(x*100, 2)) + '%') title = 'Clonetypes' table_dict = step.get_table(title, 'clonetypes_table', clonetypes) -- Gitee From f2f275a2ec447ba4be1d54d8cc2d780ecbc20fad Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 4 Jun 2021 16:37:42 +0800 Subject: [PATCH 24/96] report1.0 total:TCR/BCR data, cell:matched barcode --- .../html/tracer_vdj/go_assemble_summary.html | 34 +++++++++++++++++++ .../html/tracer_vdj/vdj_sum_summary.html | 6 ++++ celescope/tracer_vdj/go_assemble.py | 15 ++++---- celescope/tracer_vdj/split_fastq.py | 22 +++++++----- celescope/tracer_vdj/vdj_sum.py | 34 +++++-------------- 5 files changed, 68 insertions(+), 43 deletions(-) create mode 100644 celescope/templates/html/tracer_vdj/go_assemble_summary.html diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html new file mode 100644 index 00000000..3a1cdb11 --- /dev/null +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -0,0 +1,34 @@ +
+

Mapping

+
+ + + {% for item in go_assemble_summary %} + {% if loop.index <= (loop.length+1)/2 %} + + {% for i in item %} + + {% endfor %} + + {% endif %} + {% endfor %} +
{{ i|e }}
+ + + {% for item in go_assemble_summary %} + {% if loop.index > (loop.length+1)/2 %} + + {% for i in item %} + + {% endfor %} + + {% endif %} + {% endfor %} +
{{ i|e }}
+
+
+
\ No newline at end of file diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html index 3c79d24f..f8881c7d 100644 --- a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -24,6 +24,12 @@ {% endfor %} +
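+        <!-- interactive barcode-rank plot: vdj_sum.py injects this chart via get_plot_elements.plot_barcode_rank(count_umi) -->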
+ {{ chart|safe }} +
+ +
+
\ No newline at end of file diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index cc8912a3..5750f192 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -44,6 +44,7 @@ def bracer(fq, outdir, species): f'--single_end ' f'--small_index ' f'--no_trimming ' + f'-r ' f'--species {species} ' f'-c {BRACER_CONF} ' f'{prefix} ' @@ -75,6 +76,7 @@ def tracer(fq, outdir, species): f'--single_end ' f'--small_index ' f'-m assembly ' + f'-r ' f'--species {species} ' f'-c {CONF_PATH} ' f'{fq} ' @@ -159,18 +161,17 @@ def go_assemble_summary(outdir, type): medianA = int(np.median(TRA_UMIs)) TRB_UMIs_count = sum(TRB_UMIs) medianB = int(np.median(TRB_UMIs)) - total_counts = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ 'item': f'UMIs mapped to TRA', 'count': TRA_UMIs_count, - 'total_count': total_counts, + 'total_count': all_UMIs, }) go_assemble_summary.append({ 'item': f'UMIs mapped to TRB', 'count': TRB_UMIs_count, - 'total_count': total_counts, + 'total_count': all_UMIs, }) with open(f'{outdir}/tmp.txt', 'w') as f: @@ -194,24 +195,22 @@ def go_assemble_summary(outdir, type): IGL = sum(IGL_UMIs) medianL = np.median(IGL_UMIs) - total_counts = IGH + IGK + IGL - go_assemble_summary.append({ 'item': f'UMIs mapped to IGH', 'count': IGH, - 'total_count': total_counts, + 'total_count': all_UMIs, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGK', 'count': IGK, - 'total_count': total_counts, + 'total_count': all_UMIs, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGL', 'count': IGL, - 'total_count': total_counts, + 'total_count': all_UMIs, }) with open(f'{outdir}/tmp.txt', 'w') as f: diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 699be4c5..e9076627 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -60,6 +60,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): reads_count_dict = {} # all barcodes and reads num for each barcode umi_count_dict = defaultdict(list) umi_count = {} + with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') @@ -67,22 +68,25 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): umi = attr[1] if barcode in barcodes: barcode_reads_dict[barcode].append(entry) - if umi_count_dict[barcode].count(umi) == 0: - umi_count_dict[barcode].append(umi) + if umi_count_dict[barcode].count(umi) == 0: + umi_count_dict[barcode].append(umi) for barcode in barcodes: reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - + + for barcode in list(umi_count_dict.keys()): umi_count[barcode] = len(umi_count_dict[barcode]) - df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMIs_count']) - df_umi = df_umi.reset_index().rename(columns={'index': 'barcode'}) + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) + + df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['reads_count']) - reads_count = reads_count.reset_index().rename(columns={'index': 'barcode'}) + reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) + reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) - df_f = pd.merge(reads_count, df_umi, on='barcode', how='inner') + df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') - df_f = 
df_f.set_index('barcode') + df_f = df_f.set_index('Barcode') i = 1 diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index aec8dde6..759b0e02 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -11,6 +11,8 @@ import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common import glob +from celescope.tools.cellranger3 import get_plot_elements +import json def tpm_count(ass_dir): @@ -93,6 +95,7 @@ def vdj_sum(args): sample = args.sample outdir = args.outdir fastq_dir = args.fastq_dir + UMI_min = args.UMI_min filtered = filtering(type, ass_dir, outdir) @@ -102,22 +105,12 @@ def vdj_sum(args): stat_file = outdir + '/stat.txt' vdj_sum_summary = [] + + count_umi = f'{fastq_dir}/../umi_count.tsv' if type == 'TCR': - CB = filtered['cell_name'].tolist() - - df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') - - all_cells = df_umi['cell_name'].tolist() - - df_umi = df_umi.set_index('cell_name') - - for i in all_cells: - if i in CB: - df_umi.loc[i, 'mark'] = 'CB' - else: - df_umi.loc[i, 'mark'] = 'UB' + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) count_a = filtered[filtered['locus'] == 'A'].shape[0] count_b = filtered[filtered['locus'] == 'B'].shape[0] @@ -213,19 +206,7 @@ def vdj_sum(args): elif type == 'BCR': - CB = filtered['CELL'].tolist() - - df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') - - all_cells = df_umi['cell_name'].tolist() - - df_umi = df_umi.set_index('cell_name') - - for i in all_cells: - if i in CB: - df_umi.loc[i, 'mark'] = 'CB' - else: - df_umi.loc[i, 'mark'] = 'UB' + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) filtered_h = filtered[filtered['LOCUS'] == 'H'] filtered_k = filtered[filtered['LOCUS'] == 'K'] @@ -411,6 +392,7 @@ def get_opts_vdj_sum(parser, sub_program): parser.add_argument('--ass_dir', help='assemble dir', required=True) parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--UMI_min', help='int, min UMI per cell, if not set, will be counted by UMI rank 20', default='auto') -- Gitee From 3764a63fe9462052075724b933ff29ee44c7657f Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 4 Jun 2021 16:46:38 +0800 Subject: [PATCH 25/96] ranked by UMI --- celescope/tracer_vdj/split_fastq.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index e9076627..ebcff1f6 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -76,14 +76,20 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): for barcode in list(umi_count_dict.keys()): umi_count[barcode] = len(umi_count_dict[barcode]) - df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = df_umi.sort_values(by='UMI', ascending=False) df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) - df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) + CB = reads_count['Barcode'].tolist() + + df_umi['mark'] = df_umi["Barcode"].apply( + lambda x: "CB" if (x in CB) 
else "UB") + + df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') + df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') df_f = df_f.set_index('Barcode') -- Gitee From 1797cc7b8e41bac10718d25da849be0baf7bd784 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 13:22:38 +0800 Subject: [PATCH 26/96] rewrite clonetype table and productive cells for UMI --- .../html/tracer_vdj/go_assemble_summary.html | 6 + celescope/tracer_vdj/go_assemble.py | 16 ++ celescope/tracer_vdj/split_fastq.py | 81 +++--- celescope/tracer_vdj/vdj_sum.py | 243 ++++++++++-------- 4 files changed, 199 insertions(+), 147 deletions(-) diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html index 3a1cdb11..768043e2 100644 --- a/celescope/templates/html/tracer_vdj/go_assemble_summary.html +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -2,9 +2,15 @@

Mapping

{% for item in go_assemble_summary %} diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 5750f192..c456bae1 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -162,6 +162,14 @@ def go_assemble_summary(outdir, type): TRB_UMIs_count = sum(TRB_UMIs) medianB = int(np.median(TRB_UMIs)) + totals = TRA_UMIs_count + TRB_UMIs_count + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to TRA or TRB', + 'count': totals, + 'total_count': all_UMIs, + }) + go_assemble_summary.append({ 'item': f'UMIs mapped to TRA', 'count': TRA_UMIs_count, @@ -195,6 +203,14 @@ def go_assemble_summary(outdir, type): IGL = sum(IGL_UMIs) medianL = np.median(IGL_UMIs) + totals = IGH + IGK + IGL + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'count': totals, + 'total_count': all_UMIs, + }) + go_assemble_summary.append({ 'item': f'UMIs mapped to IGH', 'count': IGH, diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index ebcff1f6..ce496264 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -11,17 +11,21 @@ from celescope.tools.Step import Step, s_common @utils.add_log -def annotation_barcodes(match_dir, type): +def get_barcodes(match_dir, type): + """ + get reversed barcodes + VDJ barcodes and RNA barcodes are complementary and reversed + """ - cluster_data = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv') - cluster_data = cluster_data[0] - cluster_type = pd.read_csv(cluster_data, sep='\t') + clusterFile = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv') + clusterFile = clusterFile[0] + cluster_data = pd.read_csv(clusterFile, sep='\t') # filter barcodes if type == 'TCR': - clusters = list(cluster_type[cluster_type['cell_type'] == 'T cells']['cluster']) + clusters = cluster_data[cluster_data['cell_type'] == 'T cells']['cluster'].tolist() elif type == 'BCR': - clusters = list(cluster_type[cluster_type['cell_type'] == 'B cells']['cluster']) + clusters = cluster_data[cluster_data['cell_type'] == 'B cells']['cluster'].tolist() tsne = glob.glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv') tsne = tsne[0] @@ -32,11 +36,11 @@ def annotation_barcodes(match_dir, type): tmp = tsne_coord[tsne_coord['cluster'] == cluster].index.tolist() barcodes += tmp # write barcodes - barcodes_path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') - barcodes_path = barcodes_path[0] + path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') + path = path[0] res = [] - with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: + with open(f'{path}/reversed_barcodes.tsv', 'w') as fh: for barcode in barcodes: barcode = Seq(barcode) barcode_reversed = barcode.reverse_complement() @@ -48,52 +52,52 @@ def annotation_barcodes(match_dir, type): @utils.add_log -def get_fastq_to_assemble(fq_outdir, fq, barcodes): +def get_fqs(fq_outdir, fq, barcodes): """ split_fastq + split clean fq from cutadapt by procided barcodes + -Input: + fq_outdir, splited fq file out dir. + fq, clean fq file. + barcodes, reversed barcodes from RNA data. + -Output: + 'umi_count.tsv', 4 cols, Barcode, readcount, UMI, mark. + 'fastq' dir, contains fqs. 
""" if not os.path.exists(fq_outdir): os.makedirs(fq_outdir) - barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads - # umi_count = defaultdict(list) - reads_count_dict = {} # all barcodes and reads num for each barcode - umi_count_dict = defaultdict(list) - umi_count = {} + barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode + reads_count_dict = {} # reads count for each barcode + + umi_dict = defaultdict(list) # umi list for each barcode + umi_count = {} # umi count for each barcode with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') barcode = attr[0] umi = attr[1] - if barcode in barcodes: - barcode_reads_dict[barcode].append(entry) - if umi_count_dict[barcode].count(umi) == 0: - umi_count_dict[barcode].append(umi) - for barcode in barcodes: + barcode_reads_dict[barcode].append(entry) + if umi_dict[barcode].count(umi) == 0: + umi_dict[barcode].append(umi) + + for barcode in list(umi_dict.keys()): reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - - for barcode in list(umi_count_dict.keys()): - umi_count[barcode] = len(umi_count_dict[barcode]) + umi_count[barcode] = len(umi_dict[barcode]) df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) - df_umi = df_umi.sort_values(by='UMI', ascending=False) df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) - CB = reads_count['Barcode'].tolist() - - df_umi['mark'] = df_umi["Barcode"].apply( - lambda x: "CB" if (x in CB) else "UB") - - df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') df_f = df_f.set_index('Barcode') + df_f = df_f.sort_values(by='UMI', ascending=False) + i = 1 for barcode in barcodes: @@ -105,17 +109,18 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): f.write(str(entry) + '\n') if i % 1000 == 0: - get_fastq_to_assemble.logger.info(f'processed {i} cells') + get_fqs.logger.info(f'processed {i} cells') if i == len(barcodes): - get_fastq_to_assemble.logger.info(f'finally get {i} cells') + get_fqs.logger.info(f'finally get {i} cells') i += 1 + + df_f['cell_name'].fillna(0, inplace=True) df_f = df_f.astype(int) - df_f.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') - - + df_f.to_csv(f'{fq_outdir}/../count.txt', sep='\t') + def split_fastq(args): type = args.type @@ -126,9 +131,9 @@ def split_fastq(args): fq = args.fq fq_outdir = f'{outdir}/fastq' - barcodes = annotation_barcodes(match_dir, type) + barcodes = get_barcodes(match_dir, type) - get_fastq_to_assemble(fq_outdir, fq, barcodes) + get_fqs(fq_outdir, fq, barcodes) def get_opts_split_fastq(parser, sub_program): diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 759b0e02..29c08cf3 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -106,77 +106,94 @@ def vdj_sum(args): vdj_sum_summary = [] - count_umi = f'{fastq_dir}/../umi_count.tsv' + count_umi_file = f'{fastq_dir}/../count.txt' + + count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + + all_cells = count_umi.shape[0] if type == 'TCR': - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) - - count_a = filtered[filtered['locus'] == 'A'].shape[0] - count_b = filtered[filtered['locus'] == 'B'].shape[0] - paired_cell = 
pd.DataFrame(filtered['cell_name'].value_counts()) - productive_cells = paired_cell.shape[0] - unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] - paired_cell = paired_cell[paired_cell['cell_name'] == 2] - paired_cell = list(paired_cell.index) - - aaseqs = [] - for cell in paired_cell: - temp = filtered[filtered['cell_name'] == cell] - temp_loci = list(temp['locus']) - temp_aaseq = list(temp['CDR3aa']) - string = 'TR{}:C{}F;TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0], temp_loci[1], temp_aaseq[1]) - aaseqs.append(string) - - for cell in list(unpaired_cell.index): - temp = filtered[filtered['cell_name'] == cell] - temp_loci = list(temp['locus']) - temp_aaseq = list(temp['CDR3aa']) - string = 'TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0]) - aaseqs.append(string) - - per_count_data = pd.DataFrame() - per_count_data['cdr3s_aa'] = aaseqs - clonetypes = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) - clonetypes.columns = ["Frequency"] - Percent = [] + productive_cells = set(filtered['cell_name'].tolist()) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + productive_cells_num = len(productive_cells) + + TRA_chain = filtered[filtered['locus'] == 'A'] + TRA_chain_num = TRA_chain.shape[0] + TRB_chain = filtered[filtered['locus'] == 'B'] + TRB_chain_num = TRB_chain.shape[0] + + TRAs, TRBs = [], [] + paired_cell = 0 + for cell in productive_cells: + tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] + if tmp1.empty is not True: + chainA = tmp1['CDR3aa'].tolist()[0] + TRAs.append(chainA) + else: + TRAs.append('NaN') + + tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] + if tmp2.empty is not True: + chainB = tmp2['CDR3aa'].tolist()[0] + TRBs.append(chainB) + else: + TRBs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_cell += 1 + + clonetypes_table = pd.DataFrame() + clonetypes_table['TRA_chain'] = TRAs + clonetypes_table['TRB_chain'] = TRBs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) + sum = clonetypes['Frequency'].sum() + proportions = [] for f in list(clonetypes['Frequency']): p = f/sum - Percent.append(p) - clonetypes['Percent'] = Percent + p = round(p, 4) + p = str(p * 100) + '%' + proportions.append(p) + clonetypes['Proportion'] = proportions + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': matched_bcs, - 'total_count': matched_bcs, - }) + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.txt', sep='\t') vdj_sum_summary.append({ - 'item': 'Productive cells', - 'count': productive_cells, - 'total_count': matched_bcs + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ 'item': 'Cells with TRA', - 'count': count_a, - 'total_count': matched_bcs, + 'count': TRA_chain_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ 'item': 'Cells with TRB', - 'count': count_b, - 
'total_count': matched_bcs, + 'count': TRB_chain_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ 'item': 'Cells with paired TRA and TRB', - 'count': len(paired_cell), - 'total_count': matched_bcs, + 'count': paired_cell, + 'total_count': all_cells, }) with open(f'{ass_dir}/tmp.txt', 'r') as f: @@ -206,7 +223,15 @@ def vdj_sum(args): elif type == 'BCR': - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) + productive_cells = set(filtered['CELL'].tolist()) + + productive_cells_num = len(productive_cells) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) filtered_h = filtered[filtered['LOCUS'] == 'H'] filtered_k = filtered[filtered['LOCUS'] == 'K'] @@ -215,101 +240,102 @@ def vdj_sum(args): filtered_k_count = filtered_k.shape[0] filtered_l_count = filtered_l.shape[0] - paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) - productive_cells = paired_cell.shape[0] + IGHs, IGKs, IGLs = [], [], [] - paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) - productive_cells = paired_cell.shape[0] - unpaired_cell = paired_cell[paired_cell['CELL'] == 1] - paired_cell = paired_cell[paired_cell['CELL'] == 2] - paired_k = 0 - paired_l = 0 + paired_k, paired_l = 0, 0 - clones = pd.DataFrame() - cells = list(paired_cell.index) - aaseqs = [] + for cell in productive_cells: + tmp1 = filtered_h[filtered_h['CELL'] == cell] + if tmp1.empty is not True: + seq = tmp1['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGHs.append(aaseq) + else: + IGHs.append('NaN') + + tmp2 = filtered_l[filtered_l['CELL'] == cell] + if tmp2.empty is not True: + seq = tmp2['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGLs.append(aaseq) + else: + IGLs.append('NaN') + + tmp3 = filtered_k[filtered_k['CELL'] == cell] + if tmp3.empty is not True: + seq = tmp3['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGKs.append(aaseq) + else: + IGKs.append('NaN') - for cell in cells: - if 'K' in list(filtered[filtered['CELL'] == cell]['LOCUS']): - paired_k += 1 - elif 'L' in list(filtered[filtered['CELL'] == cell]['LOCUS']): + if not tmp1.empty and not tmp2.empty: paired_l += 1 - tep = filtered[filtered['CELL'] == cell] - tep_loci = list(tep['LOCUS']) - cdr3 = list(tep['JUNCTION']) - aaseq = [] - for seq in cdr3: - seq = Seq(seq) - seq = seq.translate() - aaseq.append(seq) - string = 'IG{}:{};IG{}:{}'.format(tep_loci[0], aaseq[0], tep_loci[1], aaseq[1]) - aaseqs.append(string) - - for cell in list(unpaired_cell.index): - cells.append(cell) - locus = list(filtered[filtered['CELL'] == cell]['LOCUS']) - cdr3 = list(filtered[filtered['CELL'] == cell]['JUNCTION']) - seq = Seq(cdr3[0]) - seq = seq.translate() - string = 'IG{}:{}'.format(locus[0], seq) - aaseqs.append(string) - - clones['CELLS'] = cells - - clones["cdr3s_aa"] = aaseqs - clonetypes = pd.DataFrame(clones['cdr3s_aa'].value_counts()) - clonetypes.columns = ["Frequency"] - Percent = [] + if not tmp1.empty and not tmp3.empty: + paired_k += 1 + + clonetypes_table = pd.DataFrame() + + clonetypes_table['IGH_chain'] = IGHs + clonetypes_table['IGL_chain'] = IGLs + clonetypes_table['IGK_chain'] = IGKs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + + Proportion = [] sum = clonetypes['Frequency'].sum() for f 
in list(clonetypes['Frequency']): p = f/sum - Percent.append(p) - clonetypes['Percent'] = Percent + p = round(p, 4) + p = str(p*100) + '%' + Proportion.append(p) + clonetypes['Proportion'] = Proportion + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - vdj_sum_summary.append({ - 'item': 'Matched cells', - 'count': matched_bcs, - 'total_count': matched_bcs - }) vdj_sum_summary.append({ - 'item': 'Productive cells', - 'count': productive_cells, - 'total_count': matched_bcs + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGH', 'count': filtered_h_count, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGK', 'count': filtered_k_count, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGL', 'count': filtered_l_count, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGH and IGK', 'count': paired_k, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGH and IGL', 'count': paired_l, - 'total_count': matched_bcs + 'total_count': all_cells }) with open(f'{ass_dir}/tmp.txt', 'r') as f: @@ -377,7 +403,6 @@ def vdj_sum(args): # clonetype table - clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: str(round(x*100, 2)) + '%') title = 'Clonetypes' table_dict = step.get_table(title, 'clonetypes_table', clonetypes) -- Gitee From 3a41427f093a351da8c06b85c06621fd975ab694 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 15:03:57 +0800 Subject: [PATCH 27/96] class split_fastq --- celescope/tracer_vdj/split_fastq.py | 157 ++++++++++++++++------------ 1 file changed, 89 insertions(+), 68 deletions(-) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index ce496264..ac11ce47 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -10,7 +10,6 @@ from celescope.tools import utils from celescope.tools.Step import Step, s_common -@utils.add_log def get_barcodes(match_dir, type): """ get reversed barcodes @@ -51,97 +50,119 @@ def get_barcodes(match_dir, type): return res -@utils.add_log -def get_fqs(fq_outdir, fq, barcodes): +class Split_fastq(Step): """ - split_fastq - split clean fq from cutadapt by procided barcodes - -Input: - fq_outdir, splited fq file out dir. - fq, clean fq file. - barcodes, reversed barcodes from RNA data. - -Output: - 'umi_count.tsv', 4 cols, Barcode, readcount, UMI, mark. - 'fastq' dir, contains fqs. + Features + + - Get reversed barcodes from RNA annotation results. + - Split clean R2 fastq file and count reads and UMIs for each barcode. + + Output + + - `03.split_fastq/count.txt`, 4 columns, barcodes, reads count, UMIs count, mark. + - `03.split_fastq/fastq`, fastq file directory for each barcode in reversed barcodes. 
""" - if not os.path.exists(fq_outdir): - os.makedirs(fq_outdir) - - barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode - reads_count_dict = {} # reads count for each barcode - - umi_dict = defaultdict(list) # umi list for each barcode - umi_count = {} # umi count for each barcode - - with pysam.FastxFile(fq) as fq: - for entry in fq: - attr = entry.name.split('_') - barcode = attr[0] - umi = attr[1] - barcode_reads_dict[barcode].append(entry) - if umi_dict[barcode].count(umi) == 0: - umi_dict[barcode].append(umi) - - for barcode in list(umi_dict.keys()): - reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - umi_count[barcode] = len(umi_dict[barcode]) - df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) - df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.type = args.type + self.fq = args.fq + self.match_dir = args.match_dir + self.fq_outdir = f'{self.outdir}/fastq' + + # out file name + self.count_file = f'{self.outdir}/{self.sample}_count.txt' + + @utils.add_log + def get_fqs(self): + """ + split_fastq + split clean fq from cutadapt by procided barcodes + -Input: + fq_outdir, splited fq file out dir. + fq, clean fq file. + barcodes, reversed barcodes from RNA data. + -Output: + 'umi_count.tsv', 4 cols, Barcode, readcount, UMI, mark. + 'fastq' dir, contains fqs. + """ + if not os.path.exists(self.fq_outdir): + os.makedirs(self.fq_outdir) + + barcodes = get_barcodes(self.match_dir, self.type) + + barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode + reads_count_dict = {} # reads count for each barcode + + umi_dict = defaultdict(list) # umi list for each barcode + umi_count = {} # umi count for each barcode + + with pysam.FastxFile(self.fq) as fq: + for entry in fq: + attr = entry.name.split('_') + barcode = attr[0] + umi = attr[1] + barcode_reads_dict[barcode].append(entry) + if umi_dict[barcode].count(umi) == 0: + umi_dict[barcode].append(umi) + + for barcode in list(umi_dict.keys()): + reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) + umi_count[barcode] = len(umi_dict[barcode]) - reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) - reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) - df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') + reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) + reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) - df_f = df_f.set_index('Barcode') + df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') - df_f = df_f.sort_values(by='UMI', ascending=False) + df_f = df_f.set_index('Barcode') - i = 1 + df_f = df_f.sort_values(by='UMI', ascending=False) - for barcode in barcodes: + i = 1 - df_f.loc[barcode, 'cell_name'] = i + for barcode in barcodes: + + df_f.loc[barcode, 'cell_name'] = i - with open(f'{fq_outdir}/{i}.fq', 'w') as f: - for entry in barcode_reads_dict[barcode]: - f.write(str(entry) + '\n') + with open(f'{self.fq_outdir}/{i}.fq', 'w') as f: + for entry in barcode_reads_dict[barcode]: + f.write(str(entry) + '\n') - if i % 1000 == 0: - get_fqs.logger.info(f'processed {i} cells') + if i % 1000 == 0: + Split_fastq.get_fqs.logger.info(f'processed {i} cells') 
- if i == len(barcodes): - get_fqs.logger.info(f'finally get {i} cells') + if i == len(barcodes): + Split_fastq.get_fqs.logger.info(f'finally get {i} cells') - i += 1 + i += 1 + + df_f['cell_name'].fillna(0, inplace=True) - df_f['cell_name'].fillna(0, inplace=True) - - df_f = df_f.astype(int) - df_f.to_csv(f'{fq_outdir}/../count.txt', sep='\t') - + df_f = df_f.astype(int) + df_f.to_csv(self.count_file, sep='\t') + self.clean_up() + + +@utils.add_log def split_fastq(args): - type = args.type - match_dir = args.match_dir - sample = args.sample - outdir = args.outdir - assay = args.assay - fq = args.fq - - fq_outdir = f'{outdir}/fastq' - barcodes = get_barcodes(match_dir, type) - - get_fqs(fq_outdir, fq, barcodes) + step_name = 'split_fastq' + split_fastq_obj = Split_fastq(args, step_name) + split_fastq_obj.get_fqs() -def get_opts_split_fastq(parser, sub_program): +def get_opts_split_fastq(parser, sub_program=True): if sub_program: parser = s_common(parser) parser.add_argument('--fq', required=True) parser.add_argument('--match_dir', help='matched rna_dir') parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + -- Gitee From cc5dcd5606b5c377a6a4f8c5e368c5cd06cb4a1d Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 15:56:35 +0800 Subject: [PATCH 28/96] add class Go_assemble --- celescope/tracer_vdj/go_assemble.py | 315 +++++++++++++++------------- 1 file changed, 165 insertions(+), 150 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index c456bae1..9a025c40 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -19,6 +19,16 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' +def get_umi_count(fq): + umis = [] + with pysam.FastxFile(fq) as fh: + for entry in fh: + attr = entry.name.split('_') + barcode = attr[0] + umi = attr[1] + umis.append(umi) + res = len(set(umis)) + return res # 开始组装 @@ -86,37 +96,55 @@ def tracer(fq, outdir, species): os.system(cmd) -@utils.add_log -def run_tracer(outdir, fastq_dir, species, thread): +class Go_assemble(Step): + """ + Features - fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] - outdirs = [outdir] * len(fqs) - species = [species] * len(fqs) - if not os.path.exists(f'{outdir}/tracer'): - os.makedirs(f'{outdir}/tracer') + - Assemble TCR/BCR full length by tracer. + - Summary mapping rate. - all_res = [] - with ProcessPoolExecutor(thread) as pool: - for res in pool.map(tracer, fqs, outdirs, species): - all_res.append(res) + Output - tracer_summarise(outdir) + - `04.go_assemble/tracer` or `04.go_assemble/bracer` Tracer output directory. + - `04.go_assemble/stat.txt` Recording mapping rate. 
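+
+    Example (an illustrative sketch only; `parser` is assumed to be built with get_opts_go_assemble)
+
+        args = parser.parse_args()  # supplies --fastq_dir, --species, --type, --thread
+        Go_assemble(args, 'go_assemble').run()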
+ """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + self.species = args.species + self.type = args.type + self.thread = int(args.thread) + self.fastq_dir = args.fastq_dir -@utils.add_log -def run_bracer(outdir, fastq_dir, species, thread): - fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] - outdirs = [outdir] * len(fqs) - species = [species] * len(fqs) - if not os.path.exists(f'{outdir}/bracer'): - os.makedirs(f'{outdir}/bracer') + def run_tracer(self): + + fqs = [join(self.fastq_dir, f) for f in listdir(self.fastq_dir) if isfile(join(self.fastq_dir, f))] + outdirs = [self.outdir] * len(fqs) + species = [self.species] * len(fqs) + if not os.path.exists(f'{self.outdir}/tracer'): + os.makedirs(f'{self.outdir}/tracer') + + all_res = [] + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(tracer, fqs, outdirs, species): + all_res.append(res) - all_res = [] - with ProcessPoolExecutor(thread) as pool: - for res in pool.map(bracer, fqs, outdirs, species): - all_res.append(res) + tracer_summarise(self.outdir) - bracer_summarise(outdir) + + def run_bracer(self): + fqs = [join(self.fastq_dir, f) for f in listdir(self.fastq_dir) if isfile(join(self.fastq_dir, f))] + outdirs = [self.outdir] * len(fqs) + species = [self.species] * len(fqs) + if not os.path.exists(f'{self.outdir}/bracer'): + os.makedirs(f'{self.outdir}/bracer') + + all_res = [] + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(bracer, fqs, outdirs, species): + all_res.append(res) + + bracer_summarise(self.outdir) ################def get_reads_count(fq): @@ -126,137 +154,124 @@ def run_bracer(outdir, fastq_dir, species, thread): # count += 1 # return count - -def get_umi_count(fq): - umis = [] - with pysam.FastxFile(fq) as fh: - for entry in fh: - attr = entry.name.split('_') - barcode = attr[0] - umi = attr[1] - umis.append(umi) - res = len(set(umis)) - return res - - -def go_assemble_summary(outdir, type): - - total_fq = f'{outdir}/../03.split_fastq/reads_count.tsv' - UMIs = pd.read_csv(total_fq, sep='\t') - - all_UMIs = UMIs['UMIs_count'].tolist() - medians = int(np.median(all_UMIs)) - all_UMIs = sum(all_UMIs) - - stat_file = outdir + '/../04.go_assemble/stat.txt' - - go_assemble_summary = [] - - if type == 'TCR': - TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - TRA_UMIs_count = sum(TRA_UMIs) - medianA = int(np.median(TRA_UMIs)) - TRB_UMIs_count = sum(TRB_UMIs) - medianB = int(np.median(TRB_UMIs)) - - totals = TRA_UMIs_count + TRB_UMIs_count - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA or TRB', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRA', - 'count': TRA_UMIs_count, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRB', - 'count': TRB_UMIs_count, - 'total_count': all_UMIs, - }) - - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Madian UMIs per cell:{medians}\n') - f.write(f'Median TRA UMIs per cell:{medianA}\n') - f.write(f'Median TRB UMIs per cell:{medianB}\n') - - elif type == 'BCR': - IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs 
= [get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - IGH = sum(IGH_UMIs) - medianH = np.median(IGH_UMIs) - IGK = sum(IGK_UMIs) - medianK = np.median(IGK_UMIs) - IGL = sum(IGL_UMIs) - medianL = np.median(IGL_UMIs) - - totals = IGH + IGK + IGL - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL or IGK', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGH', - 'count': IGH, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGK', - 'count': IGK, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGL', - 'count': IGL, - 'total_count': all_UMIs, - }) - - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medians}\n') - f.write(f'Median IGH UMIs per Cell:{medianH}\n') - f.write(f'Median IGK UMIs per Cell:{medianK}\n') - f.write(f'Median IGL UMIs per Cell:{medianL}\n') - - df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) - - utils.gen_stat(df, stat_file) + def go_assemble_summary(self): + + count_file = f'{self.outdir}/../03.split_fastq/{self.sample}_count.txt' + UMIs = pd.read_csv(count_file, sep='\t') + + all_UMIs = UMIs['UMIs_count'].tolist() + medians = int(np.median(all_UMIs)) + all_UMIs = sum(all_UMIs) + + stat_file = self.outdir + '/stat.txt' + + go_assemble_summary = [] + + if type == 'TCR': + TRAs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] + TRA_UMIs_count = sum(TRA_UMIs) + medianA = int(np.median(TRA_UMIs)) + TRB_UMIs_count = sum(TRB_UMIs) + medianB = int(np.median(TRB_UMIs)) + + totals = TRA_UMIs_count + TRB_UMIs_count + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to TRA or TRB', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRA', + 'count': TRA_UMIs_count, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRB', + 'count': TRB_UMIs_count, + 'total_count': all_UMIs, + }) + + with open(f'{self.outdir}/tmp.txt', 'w') as f: + f.write(f'Madian UMIs per cell:{medians}\n') + f.write(f'Median TRA UMIs per cell:{medianA}\n') + f.write(f'Median TRB UMIs per cell:{medianB}\n') + + elif type == 'BCR': + IGHs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') + + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + + IGH = sum(IGH_UMIs) + medianH = np.median(IGH_UMIs) + IGK = sum(IGK_UMIs) + medianK = np.median(IGK_UMIs) + IGL = sum(IGL_UMIs) + medianL = np.median(IGL_UMIs) + + totals = IGH + IGK + IGL + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGH', + 'count': IGH, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGK', + 'count': IGK, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGL', + 'count': IGL, + 
'total_count': all_UMIs, + }) + + with open(f'{self.outdir}/tmp.txt', 'w') as f: + f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median IGH UMIs per Cell:{medianH}\n') + f.write(f'Median IGK UMIs per Cell:{medianK}\n') + f.write(f'Median IGL UMIs per Cell:{medianL}\n') + + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + + self.clean_up() + + + @utils.add_log + def run(self): + if self.type == 'TCR': + self.run_tracer() + elif self.type == 'BCR': + self.run_bracer() + self.go_assemble_summary() +@utils.add_log def go_assemble(args): step_name = 'go_assemble' - step = Step(args, step_name) - thread = int(args.thread) - fastq_dir = args.fastq_dir - outdir = args.outdir - species = args.species + go_assemble_obj = Go_assemble(args, step_name) + go_assemble_obj.run() - type = args.type - if type == 'TCR': - run_tracer(outdir, fastq_dir, species, thread) - elif type == 'BCR': - run_bracer(outdir, fastq_dir, species, thread) - - go_assemble_summary(outdir, type) - - step.clean_up() def get_opts_go_assemble(parser, sub_program): if sub_program: -- Gitee From 78b942b8cdd9354035938340f606bbd271b4fdfe Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 17:04:46 +0800 Subject: [PATCH 29/96] add Go_assemble class --- celescope/tracer_vdj/go_assemble.py | 247 +++++++++++++++------------- 1 file changed, 135 insertions(+), 112 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 9a025c40..4d0fe95a 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -19,6 +19,23 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' + +def gen_stat(summary, stat_file): + stat = summary + stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] + stat = stat.loc[:, ["item", "new_count"]] + stat.to_csv(stat_file, sep=":", header=None, index=False) + + +def percent_str_func(row): + need_percent = bool( + re.search("Cells with", row["item"], flags=re.IGNORECASE)) + if need_percent: + return "(" + str(row["percent"]) + "%)" + else: + return "" + + def get_umi_count(fq): umis = [] with pysam.FastxFile(fq) as fh: @@ -29,7 +46,121 @@ def get_umi_count(fq): umis.append(umi) res = len(set(umis)) return res -# 开始组装 + + +def assemble_summary(outdir, sample, type): + + count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' + UMIs = pd.read_csv(count_file, sep='\t') + + all_ = UMIs['UMI'].tolist() + medians = int(np.median(all_)) + all_UMIs = sum(all_) + + stat_file = outdir + '/stat.txt' + + go_assemble_summary = [] + + if type == 'TCR': + TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] + TRA_UMIs_count = sum(TRA_UMIs) + medianA = int(np.median(TRA_UMIs)) + TRB_UMIs_count = sum(TRB_UMIs) + medianB = int(np.median(TRB_UMIs)) + + totals = TRA_UMIs_count + TRB_UMIs_count + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to TRA or TRB', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRA', + 'count': TRA_UMIs_count, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRB', + 'count': TRB_UMIs_count, + 'total_count': all_UMIs, + }) + + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Madian UMIs per 
cell:{medians}\n') + f.write(f'Median TRA UMIs per cell:{medianA}\n') + f.write(f'Median TRB UMIs per cell:{medianB}\n') + + elif type == 'BCR': + IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') + + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + + IGH = sum(IGH_UMIs) + medianH = np.median(IGH_UMIs) + IGK = sum(IGK_UMIs) + medianK = np.median(IGK_UMIs) + IGL = sum(IGL_UMIs) + medianL = np.median(IGL_UMIs) + + totals = IGH + IGK + IGL + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGH', + 'count': IGH, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGK', + 'count': IGK, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGL', + 'count': IGL, + 'total_count': all_UMIs, + }) + + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median IGH UMIs per Cell:{medianH}\n') + f.write(f'Median IGK UMIs per Cell:{medianK}\n') + f.write(f'Median IGL UMIs per Cell:{medianL}\n') + + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) + + df['count'] = df['count'].apply(int) + + df['percent'] = df['count']/(df.total_count.astype('float')) * 100 + + df['percent'] = df['percent'].apply( + lambda x: round(x, 2) + ) + df['count'] = df['count'].apply(utils.format_number) + + df['percent_str'] = df.apply( + lambda row: percent_str_func(row), axis=1 + ) + + gen_stat(df, stat_file) + def bracer_summarise(outdir): @@ -131,6 +262,8 @@ class Go_assemble(Step): tracer_summarise(self.outdir) + assemble_summary(self.outdir, self.sample, self.type) + def run_bracer(self): fqs = [join(self.fastq_dir, f) for f in listdir(self.fastq_dir) if isfile(join(self.fastq_dir, f))] @@ -146,116 +279,7 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - -################def get_reads_count(fq): -# with pysam.FastxFile(fq) as fh: -# count = 0 -# for entry in fh: -# count += 1 -# return count - - def go_assemble_summary(self): - - count_file = f'{self.outdir}/../03.split_fastq/{self.sample}_count.txt' - UMIs = pd.read_csv(count_file, sep='\t') - - all_UMIs = UMIs['UMIs_count'].tolist() - medians = int(np.median(all_UMIs)) - all_UMIs = sum(all_UMIs) - - stat_file = self.outdir + '/stat.txt' - - go_assemble_summary = [] - - if type == 'TCR': - TRAs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - TRA_UMIs_count = sum(TRA_UMIs) - medianA = int(np.median(TRA_UMIs)) - TRB_UMIs_count = sum(TRB_UMIs) - medianB = int(np.median(TRB_UMIs)) - - totals = TRA_UMIs_count + TRB_UMIs_count - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA or TRB', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRA', - 'count': TRA_UMIs_count, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRB', - 'count': TRB_UMIs_count, - 'total_count': all_UMIs, - }) - - with open(f'{self.outdir}/tmp.txt', 'w') 
as f: - f.write(f'Madian UMIs per cell:{medians}\n') - f.write(f'Median TRA UMIs per cell:{medianA}\n') - f.write(f'Median TRB UMIs per cell:{medianB}\n') - - elif type == 'BCR': - IGHs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs = [get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - IGH = sum(IGH_UMIs) - medianH = np.median(IGH_UMIs) - IGK = sum(IGK_UMIs) - medianK = np.median(IGK_UMIs) - IGL = sum(IGL_UMIs) - medianL = np.median(IGL_UMIs) - - totals = IGH + IGK + IGL - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL or IGK', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGH', - 'count': IGH, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGK', - 'count': IGK, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGL', - 'count': IGL, - 'total_count': all_UMIs, - }) - - with open(f'{self.outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medians}\n') - f.write(f'Median IGH UMIs per Cell:{medianH}\n') - f.write(f'Median IGK UMIs per Cell:{medianK}\n') - f.write(f'Median IGL UMIs per Cell:{medianL}\n') - - df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) - - utils.gen_stat(df, stat_file) - - self.clean_up() - + assemble_summary(self.outdir, self.sample, self.type) @utils.add_log def run(self): @@ -263,7 +287,6 @@ class Go_assemble(Step): self.run_tracer() elif self.type == 'BCR': self.run_bracer() - self.go_assemble_summary() @utils.add_log -- Gitee From 7f1fc4e2f9aa71aeab08cd74e6436248c6b0ed8e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 17:54:57 +0800 Subject: [PATCH 30/96] add class vdj_sum and unify html text --- .../html/tracer_vdj/go_assemble_summary.html | 14 +- .../html/tracer_vdj/vdj_sum_summary.html | 30 +- celescope/tracer_vdj/go_assemble.py | 22 +- celescope/tracer_vdj/vdj_sum.py | 565 +++++++++--------- 4 files changed, 316 insertions(+), 315 deletions(-) diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html index 768043e2..5bb8c0bf 100644 --- a/celescope/templates/html/tracer_vdj/go_assemble_summary.html +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -3,14 +3,14 @@
{% for item in go_assemble_summary %} diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html index f8881c7d..7b5fdb08 100644 --- a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -2,17 +2,25 @@

Cell

{% for item in vdj_sum_summary %} diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 4d0fe95a..264febb0 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -74,7 +74,7 @@ def assemble_summary(outdir, sample, type): totals = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA or TRB', + 'item': f'All UMIs mapped to TRA and TRB', 'count': totals, 'total_count': all_UMIs, }) @@ -115,7 +115,7 @@ def assemble_summary(outdir, sample, type): totals = IGH + IGK + IGL go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'item': f'All UMIs mapped to IGH, IGL and IGK', 'count': totals, 'total_count': all_UMIs, }) @@ -146,21 +146,7 @@ def assemble_summary(outdir, sample, type): df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) - df['count'] = df['count'].apply(int) - - df['percent'] = df['count']/(df.total_count.astype('float')) * 100 - - df['percent'] = df['percent'].apply( - lambda x: round(x, 2) - ) - df['count'] = df['count'].apply(utils.format_number) - - df['percent_str'] = df.apply( - lambda row: percent_str_func(row), axis=1 - ) - - gen_stat(df, stat_file) - + utils.gen_stat(df, stat_file) def bracer_summarise(outdir): @@ -288,6 +274,8 @@ class Go_assemble(Step): elif self.type == 'BCR': self.run_bracer() + self.clean_up() + @utils.add_log def go_assemble(args): diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 29c08cf3..62492d27 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -13,6 +13,7 @@ from celescope.tools.Step import Step, s_common import glob from celescope.tools.cellranger3 import get_plot_elements import json +from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): @@ -82,333 +83,337 @@ def filtering(type, ass_dir, outdir): filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') return filtered + +class Vdj_sum(Step): + """ + Features -@utils.add_log -def vdj_sum(args): + - Filter tracer results by TPM. + - Calculate clonetypes. - step_name = f"vdj_sum" - step = Step(args, step_name) + Output - type = args.type - ass_dir = args.ass_dir - sample = args.sample - outdir = args.outdir - fastq_dir = args.fastq_dir - UMI_min = args.UMI_min + - `05.vdj_sum/filtered.txt` Filtered results of tracer. Each cell has unique chain for each locus. + - `05.vdj_sum/clonetypes.txt` Clonetypes calculation. 5 (TCR) or 6 (BCR) columns, clonetypeId, (detailed clonetypes), frequency, proportion. 
+ """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + self.type = args.type + self.fastq_dir = args.fastq_dir + self.ass_dir = args.ass_dir - filtered = filtering(type, ass_dir, outdir) - fqs = glob.glob(f'{fastq_dir}/*.fq') - matched_bcs = len(fqs) + @utils.add_log + def run(self): + ass_dir = self.ass_dir + outdir = self.outdir + fastq_dir = self.fastq_dir + type = self.type - stat_file = outdir + '/stat.txt' + results = filtering(type, ass_dir, outdir) - vdj_sum_summary = [] - - count_umi_file = f'{fastq_dir}/../count.txt' + stat_file = outdir + '/stat.txt' - count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) - - all_cells = count_umi.shape[0] + vdj_sum_summary = [] + + count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' - if type == 'TCR': + count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + + all_cells = count_umi.shape[0] + + if type == 'TCR': + + productive_cells = set(results['cell_name'].tolist()) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') - productive_cells = set(filtered['cell_name'].tolist()) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - productive_cells_num = len(productive_cells) - - TRA_chain = filtered[filtered['locus'] == 'A'] - TRA_chain_num = TRA_chain.shape[0] - TRB_chain = filtered[filtered['locus'] == 'B'] - TRB_chain_num = TRB_chain.shape[0] - - TRAs, TRBs = [], [] - paired_cell = 0 - for cell in productive_cells: - tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] - if tmp1.empty is not True: - chainA = tmp1['CDR3aa'].tolist()[0] - TRAs.append(chainA) - else: - TRAs.append('NaN') - - tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] - if tmp2.empty is not True: - chainB = tmp2['CDR3aa'].tolist()[0] - TRBs.append(chainB) - else: - TRBs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_cell += 1 - - clonetypes_table = pd.DataFrame() - clonetypes_table['TRA_chain'] = TRAs - clonetypes_table['TRB_chain'] = TRBs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - - sum = clonetypes['Frequency'].sum() - proportions = [] - for f in list(clonetypes['Frequency']): - p = f/sum - p = round(p, 4) - p = str(p * 100) + '%' - proportions.append(p) - clonetypes['Proportion'] = proportions - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) - - clonetypes.to_csv(f'{outdir}/clonetypes.txt', sep='\t') - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRA', - 'count': TRA_chain_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRB', - 'count': TRB_chain_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired TRA and TRB', - 'count': paired_cell, - 'total_count': all_cells, - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians = [] - for line in f: - line = 
line.rstrip('\n').split(':') - medians.append(int(line[1])) + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + productive_cells_num = len(productive_cells) + + TRA_chain = results[results['locus'] == 'A'] + TRA_chain_num = TRA_chain.shape[0] + TRB_chain = results[results['locus'] == 'B'] + TRB_chain_num = TRB_chain.shape[0] + + TRAs, TRBs = [], [] + paired_cell = 0 + for cell in productive_cells: + tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] + if tmp1.empty is not True: + chainA = tmp1['CDR3aa'].tolist()[0] + TRAs.append(chainA) + else: + TRAs.append('NaN') + + tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] + if tmp2.empty is not True: + chainB = tmp2['CDR3aa'].tolist()[0] + TRBs.append(chainB) + else: + TRBs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_cell += 1 + + clonetypes_table = pd.DataFrame() + clonetypes_table['TRA_chain'] = TRAs + clonetypes_table['TRB_chain'] = TRBs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) + + sum = clonetypes['Frequency'].sum() + proportions = [] + for f in list(clonetypes['Frequency']): + p = f/sum + p = round(p, 4) + p = str(p * 100) + '%' + proportions.append(p) + clonetypes['Proportion'] = proportions + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medians[1], - 'total_count': np.nan + 'item': 'Cells with TRA', + 'count': TRA_chain_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medians[2], - 'total_count': np.nan + 'item': 'Cells with TRB', + 'count': TRB_chain_num, + 'total_count': all_cells, }) + vdj_sum_summary.append({ + 'item': 'Cells with paired TRA and TRB', + 'count': paired_cell, + 'total_count': all_cells, + }) - elif type == 'BCR': + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians = [] + for line in f: + line = line.rstrip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) + + + elif type == 'BCR': + + productive_cells = set(results['CELL'].tolist()) + + productive_cells_num = len(productive_cells) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + results_h = results[results['LOCUS'] == 'H'] + results_k = results[results['LOCUS'] == 'K'] + results_l = results[results['LOCUS'] == 'L'] + results_h_count = results_h.shape[0] + results_k_count = results_k.shape[0] + results_l_count = results_l.shape[0] + + IGHs, 
IGKs, IGLs = [], [], [] + + paired_k, paired_l = 0, 0 + + for cell in productive_cells: + tmp1 = results_h[results_h['CELL'] == cell] + if tmp1.empty is not True: + seq = tmp1['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGHs.append(aaseq) + else: + IGHs.append('NaN') + + tmp2 = results_l[results_l['CELL'] == cell] + if tmp2.empty is not True: + seq = tmp2['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGLs.append(aaseq) + else: + IGLs.append('NaN') + + tmp3 = results_k[results_k['CELL'] == cell] + if tmp3.empty is not True: + seq = tmp3['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGKs.append(aaseq) + else: + IGKs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_l += 1 + if not tmp1.empty and not tmp3.empty: + paired_k += 1 + + clonetypes_table = pd.DataFrame() + + clonetypes_table['IGH_chain'] = IGHs + clonetypes_table['IGL_chain'] = IGLs + clonetypes_table['IGK_chain'] = IGKs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + + Proportion = [] + sum = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): + p = f/sum + p = round(p, 4) + p = str(p*100) + '%' + Proportion.append(p) + clonetypes['Proportion'] = Proportion + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - productive_cells = set(filtered['CELL'].tolist()) - - productive_cells_num = len(productive_cells) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - filtered_h = filtered[filtered['LOCUS'] == 'H'] - filtered_k = filtered[filtered['LOCUS'] == 'K'] - filtered_l = filtered[filtered['LOCUS'] == 'L'] - filtered_h_count = filtered_h.shape[0] - filtered_k_count = filtered_k.shape[0] - filtered_l_count = filtered_l.shape[0] - - IGHs, IGKs, IGLs = [], [], [] - - paired_k, paired_l = 0, 0 - - for cell in productive_cells: - tmp1 = filtered_h[filtered_h['CELL'] == cell] - if tmp1.empty is not True: - seq = tmp1['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGHs.append(aaseq) - else: - IGHs.append('NaN') - - tmp2 = filtered_l[filtered_l['CELL'] == cell] - if tmp2.empty is not True: - seq = tmp2['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGLs.append(aaseq) - else: - IGLs.append('NaN') - - tmp3 = filtered_k[filtered_k['CELL'] == cell] - if tmp3.empty is not True: - seq = tmp3['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGKs.append(aaseq) - else: - IGKs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_l += 1 - if not tmp1.empty and not tmp3.empty: - paired_k += 1 - - clonetypes_table = pd.DataFrame() - - clonetypes_table['IGH_chain'] = IGHs - clonetypes_table['IGL_chain'] = IGLs - clonetypes_table['IGK_chain'] = IGKs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) - - Proportion = [] - sum = clonetypes['Frequency'].sum() - for f in 
list(clonetypes['Frequency']): - p = f/sum - p = round(p, 4) - p = str(p*100) + '%' - Proportion.append(p) - clonetypes['Proportion'] = Proportion - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH', - 'count': filtered_h_count, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGK', - 'count': filtered_k_count, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGL', - 'count': filtered_l_count, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH and IGK', - 'count': paired_k, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH and IGL', - 'count': paired_l, - 'total_count': all_cells - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians=[] - for line in f: - line = line.strip('\n').split(':') - medians.append(int(line[1])) vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells }) vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medians[1], - 'total_count': np.nan + 'item': 'Cells with IGH', + 'count': results_h_count, + 'total_count': all_cells + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGK', + 'count': results_k_count, + 'total_count': all_cells }) vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medians[2], - 'total_count': np.nan + 'item': 'Cells with IGL', + 'count': results_l_count, + 'total_count': all_cells + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired IGH and IGK', + 'count': paired_k, + 'total_count': all_cells }) vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medians[3], - 'total_count': np.nan + 'item': 'Cells with paired IGH and IGL', + 'count': paired_l, + 'total_count': all_cells }) - df = pd.DataFrame(vdj_sum_summary, - columns=['item', 'count', 'total_count']) + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians=[] + for line in f: + line = line.strip('\n').split(':') + medians.append(int(line[1])) - df['count'] = df['count'].apply(int) - - df['percent'] = df['count']/(df.total_count.astype('float')) * 100 + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) - df['percent'] = df['percent'].apply( - lambda x: round(x, 2) - ) - df['count'] = df['count'].apply(utils.format_number) + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medians[3], + 'total_count': np.nan + }) + df = pd.DataFrame(vdj_sum_summary, + columns=['item', 'count', 'total_count']) - def percent_str_func(row): - need_percent = bool( - re.search("Cells with", row["item"], 
flags=re.IGNORECASE)) - if need_percent: - return "(" + str(row["percent"]) + "%)" - else: - return "" + df['count'] = df['count'].apply(int) + + df['percent'] = df['count']/(df.total_count.astype('float')) * 100 - df['percent_str'] = df.apply( - lambda row: percent_str_func(row), axis=1 - ) + df['percent'] = df['percent'].apply( + lambda x: round(x, 2) + ) + df['count'] = df['count'].apply(utils.format_number) - def gen_stat(summary, stat_file): - stat = summary - stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] - stat = stat.loc[:, ["item", "new_count"]] - stat.to_csv(stat_file, sep=":", header=None, index=False) + df['percent_str'] = df.apply( + lambda row: percent_str_func(row), axis=1 + ) - gen_stat(df, stat_file) + gen_stat(df, stat_file) -# clonetype table + # clonetype table - title = 'Clonetypes' - table_dict = step.get_table(title, 'clonetypes_table', clonetypes) + title = 'Clonetypes' + table_dict = self.get_table(title, 'clonetypes_table', clonetypes) - step.add_data_item(table_dict=table_dict) + self.add_data_item(table_dict=table_dict) - step.clean_up() + self.clean_up() + + os.remove(f'{ass_dir}/tmp.txt') + + +@utils.add_log +def vdj_sum(args): + step_name = 'vdj_sum' + vdj_sum_obj = Vdj_sum(args, step_name) + vdj_sum_obj.run() def get_opts_vdj_sum(parser, sub_program): -- Gitee From fcb0e22d67508240156dc359af2a74e4ace876cd Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 8 Jun 2021 16:32:17 +0800 Subject: [PATCH 31/96] solve vdj_sum and get_plot_elements warn --- .../tools/cellranger3/get_plot_elements.py | 3 ++ celescope/tracer_vdj/go_assemble.py | 35 ++++++++++--------- celescope/tracer_vdj/vdj_sum.py | 34 +++++++++--------- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/celescope/tools/cellranger3/get_plot_elements.py b/celescope/tools/cellranger3/get_plot_elements.py index f4431f51..391ea744 100755 --- a/celescope/tools/cellranger3/get_plot_elements.py +++ b/celescope/tools/cellranger3/get_plot_elements.py @@ -117,6 +117,9 @@ def segment_log_plot_by_length(y_data, x_start, x_end): this_segment_len = 0.0 segment_idx = [x_start] + np.seterr(divide = 'ignore') + np.seterr(invalid='ignore') + for i in range(x_start, x_end): last_i = max(x_start, i-1) dx = (np.log(i) - np.log(last_i)) / log_max_x diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 264febb0..b94a448e 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -48,14 +48,11 @@ def get_umi_count(fq): return res +@utils.add_log def assemble_summary(outdir, sample, type): count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' UMIs = pd.read_csv(count_file, sep='\t') - - all_ = UMIs['UMI'].tolist() - medians = int(np.median(all_)) - all_UMIs = sum(all_) stat_file = outdir + '/stat.txt' @@ -71,28 +68,31 @@ def assemble_summary(outdir, sample, type): TRB_UMIs_count = sum(TRB_UMIs) medianB = int(np.median(TRB_UMIs)) + all_umi_count = TRA_UMIs + TRB_UMIs + medianAll = int(np.median(all_umi_count)) + totals = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ 'item': f'All UMIs mapped to TRA and TRB', 'count': totals, - 'total_count': all_UMIs, + 'total_count': np.nan, }) go_assemble_summary.append({ 'item': f'UMIs mapped to TRA', 'count': TRA_UMIs_count, - 'total_count': all_UMIs, + 'total_count': totals, }) go_assemble_summary.append({ 'item': f'UMIs mapped to TRB', 'count': TRB_UMIs_count, - 'total_count': all_UMIs, + 'total_count': totals, }) with open(f'{outdir}/tmp.txt', 'w') as f: 
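# --- editor's note --------------------------------------------------------
# A minimal, runnable sketch (hypothetical values, not part of the patch) of
# the median logic this hunk introduces just above: `TRA_UMIs + TRB_UMIs` is
# Python list concatenation, so `medianAll` is the median over the pooled
# per-cell UMI counts of both chains rather than a sum, and `int()` truncates
# a .5 median downward. Variable names mirror the patch; data is made up.
import numpy as np
TRA_UMIs = [12, 7, 30]                      # hypothetical per-cell TRA UMI counts
TRB_UMIs = [9, 22]                          # hypothetical per-cell TRB UMI counts
all_umi_count = TRA_UMIs + TRB_UMIs         # concatenation -> [12, 7, 30, 9, 22]
medianAll = int(np.median(all_umi_count))   # median of pooled list -> 12
medianA = int(np.median(TRA_UMIs))          # 12
medianB = int(np.median(TRB_UMIs))          # np.median -> 15.5, int() -> 15
# ---------------------------------------------------------------------------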
- f.write(f'Madian UMIs per cell:{medians}\n') + f.write(f'Madian UMIs per cell:{medianAll}\n') f.write(f'Median TRA UMIs per cell:{medianA}\n') f.write(f'Median TRB UMIs per cell:{medianB}\n') @@ -105,41 +105,44 @@ def assemble_summary(outdir, sample, type): IGK_UMIs = [get_umi_count(fq) for fq in IGKs] IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + all_umi_count = IGH_UMIs + IGL_UMIs + IGK_UMIs + medianAll = int(np.median(all_umi_count)) + IGH = sum(IGH_UMIs) - medianH = np.median(IGH_UMIs) + medianH = int(np.median(IGH_UMIs)) IGK = sum(IGK_UMIs) - medianK = np.median(IGK_UMIs) + medianK = int(np.median(IGK_UMIs)) IGL = sum(IGL_UMIs) - medianL = np.median(IGL_UMIs) + medianL = int(np.median(IGL_UMIs)) totals = IGH + IGK + IGL go_assemble_summary.append({ 'item': f'All UMIs mapped to IGH, IGL and IGK', 'count': totals, - 'total_count': all_UMIs, + 'total_count': np.nan, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGH', 'count': IGH, - 'total_count': all_UMIs, + 'total_count': totals, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGK', 'count': IGK, - 'total_count': all_UMIs, + 'total_count': totals, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGL', 'count': IGL, - 'total_count': all_UMIs, + 'total_count': totals, }) with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median UMIs per cell:{medianAll}\n') f.write(f'Median IGH UMIs per Cell:{medianH}\n') f.write(f'Median IGK UMIs per Cell:{medianK}\n') f.write(f'Median IGL UMIs per Cell:{medianL}\n') diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 62492d27..532ee516 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -19,18 +19,19 @@ from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble productive = rec[rec['productive'] == True] - productive['TPM'] = '' indx = list(productive.index) + tpms = [] for i in indx: - cell_name = productive.at[i, 'cell_name'] - rec_id = productive.at[i, 'recombinant_id'] + cell_name = productive.loc[i, 'cell_name'] + rec_id = productive.loc[i, 'recombinant_id'] with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: for line in tsvf: if rec_id in line: line = line.rstrip() line = line.split('\t') tpm = float(line[4]) - productive.loc[i, 'TPM'] = tpm + tpms.append(tpm) + productive.insert(loc=productive.shape[1], column='TPM', value=tpms) return productive @@ -171,8 +172,9 @@ class Vdj_sum(Step): proportions = [] for f in list(clonetypes['Frequency']): p = f/sum - p = round(p, 4) - p = str(p * 100) + '%' + p = p * 100 + p = round(p, 2) + p = str(p) + '%' proportions.append(p) clonetypes['Proportion'] = proportions clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) @@ -192,19 +194,19 @@ class Vdj_sum(Step): vdj_sum_summary.append({ 'item': 'Cells with TRA', 'count': TRA_chain_num, - 'total_count': all_cells, + 'total_count': productive_cells_num, }) vdj_sum_summary.append({ 'item': 'Cells with TRB', 'count': TRB_chain_num, - 'total_count': all_cells, + 'total_count': productive_cells_num, }) vdj_sum_summary.append({ 'item': 'Cells with paired TRA and TRB', 'count': paired_cell, - 'total_count': all_cells, + 'total_count': productive_cells_num, }) with open(f'{ass_dir}/tmp.txt', 'r') as f: @@ -301,8 +303,9 @@ class Vdj_sum(Step): sum = 
clonetypes['Frequency'].sum() for f in list(clonetypes['Frequency']): p = f/sum - p = round(p, 4) - p = str(p*100) + '%' + p = p * 100 + p = round(p, 2) + p = str(p) + '%' Proportion.append(p) clonetypes['Proportion'] = Proportion clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) @@ -322,25 +325,25 @@ class Vdj_sum(Step): vdj_sum_summary.append({ 'item': 'Cells with IGH', 'count': results_h_count, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ 'item': 'Cells with IGK', 'count': results_k_count, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ 'item': 'Cells with IGL', 'count': results_l_count, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ 'item': 'Cells with paired IGH and IGK', 'count': paired_k, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ @@ -422,7 +425,6 @@ def get_opts_vdj_sum(parser, sub_program): parser.add_argument('--ass_dir', help='assemble dir', required=True) parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) - parser.add_argument('--UMI_min', help='int, min UMI per cell, if not set, will be counted by UMI rank 20', default='auto') -- Gitee From 56ce8b7f999b42eceff38c4ce01f3a8321cfbd49 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 8 Jun 2021 19:22:26 +0800 Subject: [PATCH 32/96] drop useless code --- celescope/tracer_vdj/go_assemble.py | 54 +++++++++++++---------------- celescope/tracer_vdj/split_fastq.py | 14 ++++---- celescope/tracer_vdj/vdj_sum.py | 39 +++++++++------------ 3 files changed, 46 insertions(+), 61 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index b94a448e..8bc63b18 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -1,11 +1,10 @@ -import argparse +import re +import pandas as pd import os from os import listdir from os.path import isfile, join from concurrent.futures import ProcessPoolExecutor from celescope.tools import utils -from celescope.tools.utils import * -import datetime import glob import pysam import numpy as np @@ -19,7 +18,6 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' - def gen_stat(summary, stat_file): stat = summary stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] @@ -28,12 +26,11 @@ def gen_stat(summary, stat_file): def percent_str_func(row): - need_percent = bool( - re.search("Cells with", row["item"], flags=re.IGNORECASE)) - if need_percent: - return "(" + str(row["percent"]) + "%)" - else: - return "" + need_percent = bool(re.search("Cells with", row["item"], flags=re.IGNORECASE)) + if need_percent: + return "(" + str(row["percent"]) + "%)" + else: + return "" def get_umi_count(fq): @@ -41,7 +38,6 @@ def get_umi_count(fq): with pysam.FastxFile(fq) as fh: for entry in fh: attr = entry.name.split('_') - barcode = attr[0] umi = attr[1] umis.append(umi) res = len(set(umis)) @@ -49,16 +45,14 @@ def get_umi_count(fq): @utils.add_log -def assemble_summary(outdir, sample, type): - - count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' - UMIs = pd.read_csv(count_file, sep='\t') +def assemble_summary(outdir, sample, Seqtype): + # UMIs = pd.read_csv(count_file, sep='\t') stat_file = outdir + '/stat.txt' go_assemble_summary = [] - if type == 'TCR': + if Seqtype 
== 'TCR': TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') TRA_UMIs = [get_umi_count(fq) for fq in TRAs] @@ -74,19 +68,19 @@ def assemble_summary(outdir, sample, type): totals = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA and TRB', + 'item': 'All UMIs mapped to TRA and TRB', 'count': totals, 'total_count': np.nan, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to TRA', + 'item': 'UMIs mapped to TRA', 'count': TRA_UMIs_count, 'total_count': totals, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to TRB', + 'item': 'UMIs mapped to TRB', 'count': TRB_UMIs_count, 'total_count': totals, }) @@ -96,7 +90,7 @@ def assemble_summary(outdir, sample, type): f.write(f'Median TRA UMIs per cell:{medianA}\n') f.write(f'Median TRB UMIs per cell:{medianB}\n') - elif type == 'BCR': + elif Seqtype == 'BCR': IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') @@ -118,25 +112,25 @@ def assemble_summary(outdir, sample, type): totals = IGH + IGK + IGL go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL and IGK', + 'item': 'All UMIs mapped to IGH, IGL and IGK', 'count': totals, 'total_count': np.nan, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to IGH', + 'item': 'UMIs mapped to IGH', 'count': IGH, 'total_count': totals, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to IGK', + 'item': 'UMIs mapped to IGK', 'count': IGK, 'total_count': totals, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to IGL', + 'item': 'UMIs mapped to IGL', 'count': IGL, 'total_count': totals, }) @@ -231,7 +225,7 @@ class Go_assemble(Step): def __init__(self, args, step_name): Step.__init__(self, args, step_name) self.species = args.species - self.type = args.type + self.Seqtype = args.Seqtype self.thread = int(args.thread) self.fastq_dir = args.fastq_dir @@ -251,7 +245,7 @@ class Go_assemble(Step): tracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.type) + assemble_summary(self.outdir, self.sample, self.Seqtype) def run_bracer(self): @@ -268,13 +262,13 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.type) + assemble_summary(self.outdir, self.sample, self.Seqtype) @utils.add_log def run(self): - if self.type == 'TCR': + if self.Seqtype == 'TCR': self.run_tracer() - elif self.type == 'BCR': + elif self.Seqtype == 'BCR': self.run_bracer() self.clean_up() @@ -291,6 +285,6 @@ def get_opts_go_assemble(parser, sub_program): if sub_program: parser = s_common(parser) parser.add_argument('--fastq_dir', required=True) - parser.add_argument('--type', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index ac11ce47..a0b7973e 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -1,8 +1,6 @@ import pysam from collections import defaultdict import os -import argparse -import datetime import pandas as pd from Bio.Seq import Seq import glob @@ -10,7 +8,7 @@ from celescope.tools import utils from 
celescope.tools.Step import Step, s_common -def get_barcodes(match_dir, type): +def get_barcodes(match_dir, Seqtype): """ get reversed barcodes VDJ barcodes and RNA barcodes are complementary and reversed @@ -21,9 +19,9 @@ def get_barcodes(match_dir, type): cluster_data = pd.read_csv(clusterFile, sep='\t') # filter barcodes - if type == 'TCR': + if Seqtype == 'TCR': clusters = cluster_data[cluster_data['cell_type'] == 'T cells']['cluster'].tolist() - elif type == 'BCR': + elif Seqtype == 'BCR': clusters = cluster_data[cluster_data['cell_type'] == 'B cells']['cluster'].tolist() tsne = glob.glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv') @@ -66,7 +64,7 @@ class Split_fastq(Step): def __init__(self, args, step_name): Step.__init__(self, args, step_name) - self.type = args.type + self.Seqtype = args.Seqtype self.fq = args.fq self.match_dir = args.match_dir self.fq_outdir = f'{self.outdir}/fastq' @@ -90,7 +88,7 @@ class Split_fastq(Step): if not os.path.exists(self.fq_outdir): os.makedirs(self.fq_outdir) - barcodes = get_barcodes(self.match_dir, self.type) + barcodes = get_barcodes(self.match_dir, self.Seqtype) barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode reads_count_dict = {} # reads count for each barcode @@ -161,7 +159,7 @@ def get_opts_split_fastq(parser, sub_program=True): parser = s_common(parser) parser.add_argument('--fq', required=True) parser.add_argument('--match_dir', help='matched rna_dir') - parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 532ee516..ae5eaad2 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -1,23 +1,16 @@ -import pysam -from collections import defaultdict import os -import argparse -import datetime import pandas as pd from Bio.Seq import Seq -import glob -import re import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common -import glob from celescope.tools.cellranger3 import get_plot_elements -import json from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): - rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') + # ass_dir outdir/sample/04.go_assemble productive = rec[rec['productive'] == True] indx = list(productive.index) tpms = [] @@ -32,16 +25,16 @@ def tpm_count(ass_dir): tpm = float(line[4]) tpms.append(tpm) productive.insert(loc=productive.shape[1], column='TPM', value=tpms) - + return productive -def filtering(type, ass_dir, outdir): +def filtering(Seqtype, ass_dir, outdir): if not os.path.exists(outdir): os.makedirs(outdir) - if type == 'TCR': + if Seqtype == 'TCR': data = tpm_count(ass_dir) cell_name = set(list(data['cell_name'])) filtered = pd.DataFrame() @@ -60,7 +53,7 @@ def filtering(type, ass_dir, outdir): filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - elif type == 'BCR': + elif Seqtype == 'BCR': data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') data = data[data['FUNCTIONAL'] == True] @@ -100,7 +93,7 @@ class Vdj_sum(Step): """ def __init__(self, args, step_name): Step.__init__(self, args, step_name) - self.type = args.type + self.Seqtype = args.Seqtype self.fastq_dir = 
args.fastq_dir self.ass_dir = args.ass_dir @@ -110,9 +103,9 @@ class Vdj_sum(Step): ass_dir = self.ass_dir outdir = self.outdir fastq_dir = self.fastq_dir - type = self.type + Seqtype = self.Seqtype - results = filtering(type, ass_dir, outdir) + results = filtering(Seqtype, ass_dir, outdir) stat_file = outdir + '/stat.txt' @@ -124,7 +117,7 @@ class Vdj_sum(Step): all_cells = count_umi.shape[0] - if type == 'TCR': + if Seqtype == 'TCR': productive_cells = set(results['cell_name'].tolist()) @@ -168,10 +161,10 @@ class Vdj_sum(Step): clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - sum = clonetypes['Frequency'].sum() + sum_c = clonetypes['Frequency'].sum() proportions = [] for f in list(clonetypes['Frequency']): - p = f/sum + p = f/sum_c p = p * 100 p = round(p, 2) p = str(p) + '%' @@ -234,7 +227,7 @@ class Vdj_sum(Step): }) - elif type == 'BCR': + elif Seqtype == 'BCR': productive_cells = set(results['CELL'].tolist()) @@ -300,9 +293,9 @@ class Vdj_sum(Step): clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) Proportion = [] - sum = clonetypes['Frequency'].sum() + sum_c = clonetypes['Frequency'].sum() for f in list(clonetypes['Frequency']): - p = f/sum + p = f/sum_c p = p * 100 p = round(p, 2) p = str(p) + '%' @@ -424,7 +417,7 @@ def get_opts_vdj_sum(parser, sub_program): parser = s_common(parser) parser.add_argument('--ass_dir', help='assemble dir', required=True) parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) - parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) -- Gitee From e43bce47f2617081496a881f1a4990d6abf9cd4d Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Wed, 9 Jun 2021 16:08:27 +0800 Subject: [PATCH 33/96] pylint code --- celescope/tracer_vdj/go_assemble.py | 6 +- celescope/tracer_vdj/multi_tracer_vdj.py | 2 +- celescope/tracer_vdj/vdj_sum.py | 779 +++++++++++------------ 3 files changed, 385 insertions(+), 402 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 8bc63b18..06ae5371 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -45,7 +45,7 @@ def get_umi_count(fq): @utils.add_log -def assemble_summary(outdir, sample, Seqtype): +def assemble_summary(outdir, Seqtype): # UMIs = pd.read_csv(count_file, sep='\t') stat_file = outdir + '/stat.txt' @@ -245,7 +245,7 @@ class Go_assemble(Step): tracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype) def run_bracer(self): @@ -262,7 +262,7 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype) @utils.add_log def run(self): diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 18ee9f98..5055ec9a 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -1,4 +1,4 @@ -from celescope.tracer_vdj.__init__ import __STEPS__, __ASSAY__ +from celescope.tracer_vdj.__init__ import __ASSAY__ from celescope.tools.Multi import Multi diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index ae5eaad2..60d86d46 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ 
-5,419 +5,402 @@ import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common from celescope.tools.cellranger3 import get_plot_elements -from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): - rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') - # ass_dir outdir/sample/04.go_assemble - productive = rec[rec['productive'] == True] - indx = list(productive.index) - tpms = [] - for i in indx: - cell_name = productive.loc[i, 'cell_name'] - rec_id = productive.loc[i, 'recombinant_id'] - with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: - for line in tsvf: - if rec_id in line: - line = line.rstrip() - line = line.split('\t') - tpm = float(line[4]) - tpms.append(tpm) - productive.insert(loc=productive.shape[1], column='TPM', value=tpms) - - return productive + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') + # ass_dir outdir/sample/04.go_assemble + productive = rec[rec['productive'] == True] + indx = list(productive.index) + tpms = [] + for i in indx: + cell_name = productive.loc[i, 'cell_name'] + rec_id = productive.loc[i, 'recombinant_id'] + with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: + for line in tsvf: + if rec_id in line: + line = line.rstrip() + line = line.split('\t') + tpm = float(line[4]) + tpms.append(tpm) + productive.insert(loc=productive.shape[1], column='TPM', value=tpms) + + return productive def filtering(Seqtype, ass_dir, outdir): - - if not os.path.exists(outdir): - os.makedirs(outdir) - - if Seqtype == 'TCR': - data = tpm_count(ass_dir) - cell_name = set(list(data['cell_name'])) - filtered = pd.DataFrame() - for name in cell_name: - count_data = data[data['cell_name'] == name] - tra = count_data[count_data['locus'] == 'A'] - trb = count_data[count_data['locus'] == 'B'] - if tra.empty is not True: - tra = tra.sort_values(by='TPM', ascending=False) - tra = tra.head(1) - filtered = filtered.append(tra, ignore_index=True) - if trb.empty is not True: - trb = trb.sort_values(by='TPM', ascending=False) - trb = trb.head(1) - filtered = filtered.append(trb, ignore_index=True) - - filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - - elif Seqtype == 'BCR': - - data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') - data = data[data['FUNCTIONAL'] == True] - cell_name = set(list(data['CELL'])) - filtered = pd.DataFrame() - for name in cell_name: - count_cell = data[data['CELL'] == name] - count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) - count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) - count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) - count_k_l = count_k.append(count_l) - if count_h.empty is not True: - count_h = count_h.sort_values(by='TPM', ascending=False) - count_h = count_h.head(1) - filtered = filtered.append(count_h, ignore_index=True) - if count_k_l.empty is not True: - count_k_l = count_k_l.sort_values(by='TPM', ascending=False) - count_k_l = count_k_l.head(1) - filtered = filtered.append(count_k_l, ignore_index=True) - - filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - - return filtered - + if not os.path.exists(outdir): + os.makedirs(outdir) + + if Seqtype == 'TCR': + data = tpm_count(ass_dir) + cell_name = set(list(data['cell_name'])) + filtered = pd.DataFrame() + for name in cell_name: + count_data = data[data['cell_name'] == name] + tra = 
count_data[count_data['locus'] == 'A'] + trb = count_data[count_data['locus'] == 'B'] + if tra.empty is not True: + tra = tra.sort_values(by='TPM', ascending=False) + tra = tra.head(1) + filtered = filtered.append(tra, ignore_index=True) + if trb.empty is not True: + trb = trb.sort_values(by='TPM', ascending=False) + trb = trb.head(1) + filtered = filtered.append(trb, ignore_index=True) + + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') + + elif Seqtype == 'BCR': + + data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') + data = data[data['FUNCTIONAL'] == True] + cell_name = set(list(data['CELL'])) + filtered = pd.DataFrame() + for name in cell_name: + count_cell = data[data['CELL'] == name] + count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) + count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) + count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) + count_k_l = count_k.append(count_l) + if count_h.empty is not True: + count_h = count_h.sort_values(by='TPM', ascending=False) + count_h = count_h.head(1) + filtered = filtered.append(count_h, ignore_index=True) + if count_k_l.empty is not True: + count_k_l = count_k_l.sort_values(by='TPM', ascending=False) + count_k_l = count_k_l.head(1) + filtered = filtered.append(count_k_l, ignore_index=True) + + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') + + return filtered + class Vdj_sum(Step): - """ - Features - - - Filter tracer results by TPM. - - Calculate clonetypes. - - Output - - - `05.vdj_sum/filtered.txt` Filtered results of tracer. Each cell has unique chain for each locus. - - `05.vdj_sum/clonetypes.txt` Clonetypes calculation. 5 (TCR) or 6 (BCR) columns, clonetypeId, (detailed clonetypes), frequency, proportion. - """ - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - self.Seqtype = args.Seqtype - self.fastq_dir = args.fastq_dir - self.ass_dir = args.ass_dir - - - @utils.add_log - def run(self): - ass_dir = self.ass_dir - outdir = self.outdir - fastq_dir = self.fastq_dir - Seqtype = self.Seqtype - - results = filtering(Seqtype, ass_dir, outdir) - - stat_file = outdir + '/stat.txt' - - vdj_sum_summary = [] - - count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' - - count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) - - all_cells = count_umi.shape[0] - - if Seqtype == 'TCR': - - productive_cells = set(results['cell_name'].tolist()) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - productive_cells_num = len(productive_cells) - - TRA_chain = results[results['locus'] == 'A'] - TRA_chain_num = TRA_chain.shape[0] - TRB_chain = results[results['locus'] == 'B'] - TRB_chain_num = TRB_chain.shape[0] - - TRAs, TRBs = [], [] - paired_cell = 0 - for cell in productive_cells: - tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] - if tmp1.empty is not True: - chainA = tmp1['CDR3aa'].tolist()[0] - TRAs.append(chainA) - else: - TRAs.append('NaN') - - tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] - if tmp2.empty is not True: - chainB = tmp2['CDR3aa'].tolist()[0] - TRBs.append(chainB) - else: - TRBs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_cell += 1 - - clonetypes_table = pd.DataFrame() - clonetypes_table['TRA_chain'] = TRAs - clonetypes_table['TRB_chain'] = TRBs - clonetypes_table['Frequency'] = '' - - clonetypes = 
clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - - sum_c = clonetypes['Frequency'].sum() - proportions = [] - for f in list(clonetypes['Frequency']): - p = f/sum_c - p = p * 100 - p = round(p, 2) - p = str(p) + '%' - proportions.append(p) - clonetypes['Proportion'] = proportions - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) - - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRA', - 'count': TRA_chain_num, - 'total_count': productive_cells_num, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRB', - 'count': TRB_chain_num, - 'total_count': productive_cells_num, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired TRA and TRB', - 'count': paired_cell, - 'total_count': productive_cells_num, - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians = [] - for line in f: - line = line.rstrip('\n').split(':') - medians.append(int(line[1])) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) - - - elif Seqtype == 'BCR': - - productive_cells = set(results['CELL'].tolist()) - - productive_cells_num = len(productive_cells) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - results_h = results[results['LOCUS'] == 'H'] - results_k = results[results['LOCUS'] == 'K'] - results_l = results[results['LOCUS'] == 'L'] - results_h_count = results_h.shape[0] - results_k_count = results_k.shape[0] - results_l_count = results_l.shape[0] - - IGHs, IGKs, IGLs = [], [], [] - - paired_k, paired_l = 0, 0 - - for cell in productive_cells: - tmp1 = results_h[results_h['CELL'] == cell] - if tmp1.empty is not True: - seq = tmp1['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGHs.append(aaseq) - else: - IGHs.append('NaN') - - tmp2 = results_l[results_l['CELL'] == cell] - if tmp2.empty is not True: - seq = tmp2['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGLs.append(aaseq) - else: - IGLs.append('NaN') - - tmp3 = results_k[results_k['CELL'] == cell] - if tmp3.empty is not True: - seq = tmp3['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGKs.append(aaseq) - else: - IGKs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_l += 1 - if not tmp1.empty and not tmp3.empty: - paired_k += 1 - - clonetypes_table = pd.DataFrame() - - clonetypes_table['IGH_chain'] = IGHs - clonetypes_table['IGL_chain'] = IGLs - clonetypes_table['IGK_chain'] = IGKs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) - - Proportion = [] - sum_c = clonetypes['Frequency'].sum() - for f in 
list(clonetypes['Frequency']): - p = f/sum_c - p = p * 100 - p = round(p, 2) - p = str(p) + '%' - Proportion.append(p) - clonetypes['Proportion'] = Proportion - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH', - 'count': results_h_count, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGK', - 'count': results_k_count, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGL', - 'count': results_l_count, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired IGH and IGK', - 'count': paired_k, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired IGH and IGL', - 'count': paired_l, - 'total_count': all_cells - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians=[] - for line in f: - line = line.strip('\n').split(':') - medians.append(int(line[1])) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medians[3], - 'total_count': np.nan - }) - - df = pd.DataFrame(vdj_sum_summary, - columns=['item', 'count', 'total_count']) - - df['count'] = df['count'].apply(int) - - df['percent'] = df['count']/(df.total_count.astype('float')) * 100 - - df['percent'] = df['percent'].apply( - lambda x: round(x, 2) - ) - df['count'] = df['count'].apply(utils.format_number) - - df['percent_str'] = df.apply( - lambda row: percent_str_func(row), axis=1 - ) - - gen_stat(df, stat_file) - - # clonetype table - - title = 'Clonetypes' - table_dict = self.get_table(title, 'clonetypes_table', clonetypes) - - self.add_data_item(table_dict=table_dict) - - self.clean_up() - - os.remove(f'{ass_dir}/tmp.txt') + """ + Features + + - Filter tracer results by TPM. + - Calculate clonetypes. + + Output + + - `05.vdj_sum/filtered.txt` Filtered results of tracer. Each cell has unique chain for each locus. + - `05.vdj_sum/clonetypes.txt` Clonetypes calculation. 5 (TCR) or 6 (BCR) columns, clonetypeId, (detailed clonetypes), frequency, proportion. 
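# --- editor's note --------------------------------------------------------
# A compact sketch (hypothetical CDR3 strings) of the clonotype table that
# run() builds below: identical chain combinations are grouped, occurrences
# counted, and each frequency rendered as a percentage rounded to two
# decimals. The `size()` idiom here is an editor's substitution for the
# patch's placeholder-column-plus-count groupby; it yields the same counts.
import pandas as pd
chains = pd.DataFrame({'TRA_chain': ['CAVRGF', 'CAVRGF', 'NaN'],
                       'TRB_chain': ['CASSLG', 'CASSLG', 'CASSLG']})
clonetypes = (chains.groupby(['TRA_chain', 'TRB_chain']).size()
                    .reset_index(name='Frequency')
                    .sort_values(by='Frequency', ascending=False))
sum_c = clonetypes['Frequency'].sum()
clonetypes['Proportion'] = clonetypes['Frequency'].apply(
    lambda f: f'{round(f / sum_c * 100, 2)}%')          # e.g. '66.67%'
clonetypes['clonetypeId'] = range(1, clonetypes.shape[0] + 1)
# ---------------------------------------------------------------------------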
+ """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + self.Seqtype = args.Seqtype + self.fastq_dir = args.fastq_dir + self.ass_dir = args.ass_dir + + + @utils.add_log + def run(self): + ass_dir = self.ass_dir + outdir = self.outdir + fastq_dir = self.fastq_dir + Seqtype = self.Seqtype + + results = filtering(Seqtype, ass_dir, outdir) + + stat_file = outdir + '/stat.txt' + + vdj_sum_summary = [] + + count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' + + count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + + if Seqtype == 'TCR': + + productive_cells = set(results['cell_name'].tolist()) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + productive_cells_num = len(productive_cells) + + TRA_chain = results[results['locus'] == 'A'] + TRA_chain_num = TRA_chain.shape[0] + TRB_chain = results[results['locus'] == 'B'] + TRB_chain_num = TRB_chain.shape[0] + + TRAs, TRBs = [], [] + paired_cell = 0 + for cell in productive_cells: + tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] + if tmp1.empty is not True: + chainA = tmp1['CDR3aa'].tolist()[0] + TRAs.append(chainA) + else: + TRAs.append('NaN') + + tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] + if tmp2.empty is not True: + chainB = tmp2['CDR3aa'].tolist()[0] + TRBs.append(chainB) + else: + TRBs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_cell += 1 + + clonetypes_table = pd.DataFrame() + clonetypes_table['TRA_chain'] = TRAs + clonetypes_table['TRB_chain'] = TRBs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) + + sum_c = clonetypes['Frequency'].sum() + proportions = [] + for f in list(clonetypes['Frequency']): + p = f/sum_c + p = p * 100 + p = round(p, 2) + p = str(p) + '%' + proportions.append(p) + clonetypes['Proportion'] = proportions + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + vdj_sum_summary.append({ + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': np.nan, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRA', + 'count': TRA_chain_num, + 'total_count': productive_cells_num, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRB', + 'count': TRB_chain_num, + 'total_count': productive_cells_num, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired TRA and TRB', + 'count': paired_cell, + 'total_count': productive_cells_num, + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians = [] + for line in f: + line = line.rstrip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) + + + elif Seqtype == 'BCR': + + productive_cells = set(results['CELL'].tolist()) + + 
productive_cells_num = len(productive_cells) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + results_h = results[results['LOCUS'] == 'H'] + results_k = results[results['LOCUS'] == 'K'] + results_l = results[results['LOCUS'] == 'L'] + results_h_count = results_h.shape[0] + results_k_count = results_k.shape[0] + results_l_count = results_l.shape[0] + + IGHs, IGKs, IGLs = [], [], [] + + paired_k, paired_l = 0, 0 + + for cell in productive_cells: + tmp1 = results_h[results_h['CELL'] == cell] + if tmp1.empty is not True: + seq = tmp1['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGHs.append(aaseq) + else: + IGHs.append('NaN') + + tmp2 = results_l[results_l['CELL'] == cell] + if tmp2.empty is not True: + seq = tmp2['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGLs.append(aaseq) + else: + IGLs.append('NaN') + + tmp3 = results_k[results_k['CELL'] == cell] + if tmp3.empty is not True: + seq = tmp3['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGKs.append(aaseq) + else: + IGKs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_l += 1 + if not tmp1.empty and not tmp3.empty: + paired_k += 1 + + clonetypes_table = pd.DataFrame() + + clonetypes_table['IGH_chain'] = IGHs + clonetypes_table['IGL_chain'] = IGLs + clonetypes_table['IGK_chain'] = IGKs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + + Proportion = [] + sum_c = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): + p = f/sum_c + p = p * 100 + p = round(p, 2) + p = str(p) + '%' + Proportion.append(p) + clonetypes['Proportion'] = Proportion + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + + vdj_sum_summary.append({ + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH', + 'count': results_h_count, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGK', + 'count': results_k_count, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGL', + 'count': results_l_count, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired IGH and IGK', + 'count': paired_k, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired IGH and IGL', + 'count': paired_l, + 'total_count': productive_cells_num + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians=[] + for line in f: + line = line.strip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medians[2], + 
'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medians[3], + 'total_count': np.nan + }) + + df = pd.DataFrame(vdj_sum_summary, + columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + + # clonetype table + + title = 'Clonetypes' + table_dict = self.get_table(title, 'clonetypes_table', clonetypes) + + self.add_data_item(table_dict=table_dict) + + self.clean_up() + + os.remove(f'{ass_dir}/tmp.txt') @utils.add_log def vdj_sum(args): - step_name = 'vdj_sum' - vdj_sum_obj = Vdj_sum(args, step_name) - vdj_sum_obj.run() + step_name = 'vdj_sum' + vdj_sum_obj = Vdj_sum(args, step_name) + vdj_sum_obj.run() def get_opts_vdj_sum(parser, sub_program): - if sub_program: - parser = s_common(parser) - parser.add_argument('--ass_dir', help='assemble dir', required=True) - parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) - parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + if sub_program: + parser = s_common(parser) + parser.add_argument('--ass_dir', help='assemble dir', required=True) + parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) -- Gitee From 06c908d2277f597367921e8f905699f84f3abe19 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Wed, 9 Jun 2021 18:38:52 +0800 Subject: [PATCH 34/96] rewrite map count by bowtie2 --- celescope/tracer_vdj/go_assemble.py | 152 +++++++++++++--------------- 1 file changed, 69 insertions(+), 83 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 06ae5371..f521a8c0 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -45,101 +45,87 @@ def get_umi_count(fq): @utils.add_log -def assemble_summary(outdir, Seqtype): - # UMIs = pd.read_csv(count_file, sep='\t') +def assemble_summary(outdir, Seqtype, sample, species): stat_file = outdir + '/stat.txt' go_assemble_summary = [] - if Seqtype == 'TCR': - TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - TRA_UMIs_count = sum(TRA_UMIs) - medianA = int(np.median(TRA_UMIs)) - TRB_UMIs_count = sum(TRB_UMIs) - medianB = int(np.median(TRB_UMIs)) - - all_umi_count = TRA_UMIs + TRB_UMIs - medianAll = int(np.median(all_umi_count)) - - totals = TRA_UMIs_count + TRB_UMIs_count - - go_assemble_summary.append({ - 'item': 'All UMIs mapped to TRA and TRB', - 'count': totals, - 'total_count': np.nan, - }) - - go_assemble_summary.append({ - 'item': 'UMIs mapped to TRA', - 'count': TRA_UMIs_count, - 'total_count': totals, - }) - - go_assemble_summary.append({ - 'item': 'UMIs mapped to TRB', - 'count': TRB_UMIs_count, - 'total_count': totals, - }) + clean_fq = f'{outdir}/../02.cutadapt/{sample}__clean_2.fq' - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Madian UMIs per cell:{medianAll}\n') - f.write(f'Median TRA UMIs per cell:{medianA}\n') - f.write(f'Median TRB UMIs per cell:{medianB}\n') + count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' - elif Seqtype == 'BCR': - IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs = 
[get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - all_umi_count = IGH_UMIs + IGL_UMIs + IGK_UMIs - medianAll = int(np.median(all_umi_count)) - - IGH = sum(IGH_UMIs) - medianH = int(np.median(IGH_UMIs)) - IGK = sum(IGK_UMIs) - medianK = int(np.median(IGK_UMIs)) - IGL = sum(IGL_UMIs) - medianL = int(np.median(IGL_UMIs)) - - totals = IGH + IGK + IGL - - go_assemble_summary.append({ - 'item': 'All UMIs mapped to IGH, IGL and IGK', - 'count': totals, - 'total_count': np.nan, - }) + count_df = pd.read_csv(count_file, sep='\t') - go_assemble_summary.append({ - 'item': 'UMIs mapped to IGH', - 'count': IGH, - 'total_count': totals, - }) + total_count = count_df['readcount'].sum() - go_assemble_summary.append({ - 'item': 'UMIs mapped to IGK', - 'count': IGK, - 'total_count': totals, + if Seqtype == 'TCR': + loci = ['A', 'B'] + + total_mapped = 0 + + for locus in loci: + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/tracer/resources/{species}/combinatorial_recombinomes/TCR_{locus} ' + f'-U {clean_fq} ' + f'-S {outdir}/TR{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'aligned exactly 1 time' in line: + res = re.findall("\d+", line) + item = f'Reads mapped to TR{locus}' + count = int(res[0]) + total_mapped += count + go_assemble_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/TR{locus}.sam') + + go_assemble_summary.insert(0, { + 'item': 'All reads Mapped to TRA and TRB', + 'count': total_mapped, + 'total_count': total_count }) - go_assemble_summary.append({ - 'item': 'UMIs mapped to IGL', - 'count': IGL, - 'total_count': totals, + elif Seqtype == 'BCR': + loci = ['H', 'L', 'K'] + + total_mapped = 0 + + for locus in loci: + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/bracer/resources/{species}/combinatorial_recombinomes/BCR_{locus} ' + f'-U {clean_fq} ' + f'-S {outdir}/BR{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'aligned exactly 1 time' in line: + res = re.findall("\d+", line) + item = f'Reads mapped to BR{locus}' + count = int(res[0]) + total_mapped += count + go_assemble_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + os.system(f'rm {outdir}/BR{locus}.sam') + go_assemble_summary.insert(0, { + 'item': 'All reads Mapped to IGH, IGL and IGK', + 'count': total_mapped, + 'total_count': total_count }) - - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medianAll}\n') - f.write(f'Median IGH UMIs per Cell:{medianH}\n') - f.write(f'Median IGK UMIs per Cell:{medianK}\n') - f.write(f'Median IGL UMIs per Cell:{medianL}\n') df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) -- Gitee From b57d7533ad84a55aab1b24e9101f9f36787091a7 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 10 Jun 2021 15:39:22 +0800 Subject: [PATCH 35/96] add bowtie2 map --- celescope/tracer_vdj/go_assemble.py | 44 +++-------- celescope/tracer_vdj/split_fastq.py | 2 +- celescope/tracer_vdj/vdj_sum.py | 116 ++++++++++++++++------------ 3 files changed, 78 insertions(+), 84 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 
f521a8c0..452eb54d 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -5,9 +5,6 @@ from os import listdir from os.path import isfile, join from concurrent.futures import ProcessPoolExecutor from celescope.tools import utils -import glob -import pysam -import numpy as np from celescope.tools.Step import Step, s_common @@ -18,32 +15,6 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' -def gen_stat(summary, stat_file): - stat = summary - stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] - stat = stat.loc[:, ["item", "new_count"]] - stat.to_csv(stat_file, sep=":", header=None, index=False) - - -def percent_str_func(row): - need_percent = bool(re.search("Cells with", row["item"], flags=re.IGNORECASE)) - if need_percent: - return "(" + str(row["percent"]) + "%)" - else: - return "" - - -def get_umi_count(fq): - umis = [] - with pysam.FastxFile(fq) as fh: - for entry in fh: - attr = entry.name.split('_') - umi = attr[1] - umis.append(umi) - res = len(set(umis)) - return res - - @utils.add_log def assemble_summary(outdir, Seqtype, sample, species): @@ -51,7 +22,7 @@ def assemble_summary(outdir, Seqtype, sample, species): go_assemble_summary = [] - clean_fq = f'{outdir}/../02.cutadapt/{sample}__clean_2.fq' + clean_fq = f'{outdir}/../02.cutadapt/{sample}_clean_2.fq' count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' @@ -76,7 +47,7 @@ def assemble_summary(outdir, Seqtype, sample, species): with open(f'{outdir}/log') as fh: for line in fh: if 'aligned exactly 1 time' in line: - res = re.findall("\d+", line) + res = re.findall(r"\d+", line) item = f'Reads mapped to TR{locus}' count = int(res[0]) total_mapped += count @@ -94,6 +65,8 @@ def assemble_summary(outdir, Seqtype, sample, species): 'total_count': total_count }) + os.system(f'rm {outdir}/log') + elif Seqtype == 'BCR': loci = ['H', 'L', 'K'] @@ -111,7 +84,7 @@ def assemble_summary(outdir, Seqtype, sample, species): with open(f'{outdir}/log') as fh: for line in fh: if 'aligned exactly 1 time' in line: - res = re.findall("\d+", line) + res = re.findall(r"\d+", line) item = f'Reads mapped to BR{locus}' count = int(res[0]) total_mapped += count @@ -126,7 +99,8 @@ def assemble_summary(outdir, Seqtype, sample, species): 'count': total_mapped, 'total_count': total_count }) - + os.system(f'rm {outdir}/log') + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) utils.gen_stat(df, stat_file) @@ -231,7 +205,7 @@ class Go_assemble(Step): tracer_summarise(self.outdir) - assemble_summary(self.outdir, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype, self.sample, self.species) def run_bracer(self): @@ -248,7 +222,7 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - assemble_summary(self.outdir, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype, self.sample, self.species) @utils.add_log def run(self): diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index a0b7973e..81d372d1 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -140,7 +140,7 @@ class Split_fastq(Step): i += 1 df_f['cell_name'].fillna(0, inplace=True) - + df_f.fillna(0, inplace=True) df_f = df_f.astype(int) df_f.to_csv(self.count_file, sep='\t') diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 60d86d46..d3896db0 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -5,6 +5,19 @@ 
import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common from celescope.tools.cellranger3 import get_plot_elements +import glob +import pysam + + +def get_umi_count(fq): + umis = [] + with pysam.FastxFile(fq) as fh: + for entry in fh: + attr = entry.name.split('_') + umi = attr[1] + umis.append(umi) + res = len(set(umis)) + return res def tpm_count(ass_dir): @@ -113,6 +126,8 @@ class Vdj_sum(Step): count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + median_all = int(count_umi['UMI'].median()) + if Seqtype == 'TCR': productive_cells = set(results['cell_name'].tolist()) @@ -198,29 +213,31 @@ class Vdj_sum(Step): 'total_count': productive_cells_num, }) - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians = [] - for line in f: - line = line.rstrip('\n').split(':') - medians.append(int(line[1])) + TRAs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) + medianA = int(np.median(TRA_UMIs)) + medianB = int(np.median(TRB_UMIs)) - vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': median_all, + 'total_count': np.nan + }) - vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medianA, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medianB, + 'total_count': np.nan + }) elif Seqtype == 'BCR': @@ -341,35 +358,41 @@ class Vdj_sum(Step): 'total_count': productive_cells_num }) - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians=[] - for line in f: - line = line.strip('\n').split(':') - medians.append(int(line[1])) + IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) + medianH = int(np.median(IGH_UMIs)) + medianL = int(np.median(IGL_UMIs)) + medianK = int(np.median(IGK_UMIs)) - vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': median_all, + 'total_count': np.nan + }) - vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medians[3], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medianH, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medianL, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medianK, + 'total_count': np.nan + }) df = pd.DataFrame(vdj_sum_summary, columns=['item', 'count', 
'total_count']) @@ -385,9 +408,6 @@ class Vdj_sum(Step): self.clean_up() - os.remove(f'{ass_dir}/tmp.txt') - - @utils.add_log def vdj_sum(args): step_name = 'vdj_sum' -- Gitee From 5a1b96f42e19d8718f63432e42cbb5ac0a33fe43 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 10 Jun 2021 19:33:54 +0800 Subject: [PATCH 36/96] add trust assemble --- celescope/__init__.py | 3 +- celescope/templates/html/trust_vdj/base.html | 161 +++++++++++++++++++ celescope/trust_vdj/__init__.py | 6 + celescope/trust_vdj/multi_trust_vdj.py | 35 ++++ celescope/trust_vdj/res_filter.py | 87 ++++++++++ celescope/trust_vdj/trust_assemble.py | 130 +++++++++++++++ 6 files changed, 421 insertions(+), 1 deletion(-) create mode 100644 celescope/templates/html/trust_vdj/base.html create mode 100644 celescope/trust_vdj/__init__.py create mode 100644 celescope/trust_vdj/multi_trust_vdj.py create mode 100644 celescope/trust_vdj/res_filter.py create mode 100644 celescope/trust_vdj/trust_assemble.py diff --git a/celescope/__init__.py b/celescope/__init__.py index 015f41d1..d1f25d8b 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -14,5 +14,6 @@ ASSAY_DICT = { 'tag': 'Single Cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', - 'tracer_vdj': 'Single Cell Full Length TCR or BCR' + 'tracer_vdj': 'Single Cell Full Length vdj assemble', + 'trust_vdj': 'Single Cell Full Length vdj assemble' } diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html new file mode 100644 index 00000000..5318bb34 --- /dev/null +++ b/celescope/templates/html/trust_vdj/base.html @@ -0,0 +1,161 @@ + + + + + + report + + + + + + + + + + + + +
+ CeleScope Report
+ + + + \ No newline at end of file diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py new file mode 100644 index 00000000..69aeb6f7 --- /dev/null +++ b/celescope/trust_vdj/__init__.py @@ -0,0 +1,6 @@ +__STEPS__ = [ + 'sample', + 'barcode', + 'trust_assemble', + 'res_filter'] +__ASSAY__ = 'trust_vdj' diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py new file mode 100644 index 00000000..93ecdedb --- /dev/null +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -0,0 +1,35 @@ +from celescope.trust_vdj.__init__ import __ASSAY__ +from celescope.tools.Multi import Multi + + +class Multi_trust_vdj(Multi): + + def trust_assemble(self, sample): + step = 'trust_assemble' + cmd_line = self.get_cmd_line(step, sample) + fq1 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R1.fq{self.fq_suffix}' + fq2 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R2.fq{self.fq_suffix}' + cmd = ( + f'{cmd_line} ' + f'--fq1 {fq1} ' + f'--fq2 {fq2} ' + f'--match_dir {self.col4_dict[sample]}' + ) + self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) + + + def res_filter(self, sample): + step = 'res_filter' + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + +def main(): + multi = Multi_trust_vdj(__ASSAY__) + multi.run() + +if __name__ == '__main__': + main() diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py new file mode 100644 index 00000000..ce46c374 --- /dev/null +++ b/celescope/trust_vdj/res_filter.py @@ -0,0 +1,87 @@ +import pandas as pd +from celescope.tools.Step import Step, s_common +from celescope.tools import utils + + +@utils.add_log +def beauty_res(outdir, barcode_report): + res = pd.read_csv(barcode_report, sep='\t') + rows = res.shape[0] + loci = ['A', 'B'] + chians = ['chain2', 'chain1'] + for l in range(len(loci)): + chain = chians[l] + locus = loci[l] + + Vgenes, Dgenes, Jgenes, Cgenes, cdr3nts, cdr3aas, readcounts, fuls = [], [], [], [], [], [], [], [] + + for i in range(rows): + attr = res.loc[i, chain] + attrs = attr.split(',') + if len(attrs) == 10: + V, D, J, C, cdr3nt, cdr3aa, readcount, fl = attrs[0], attrs[1], attrs[2], attrs[3], attrs[4], attrs[5], attrs[6], attrs[-1] + Vgenes.append(V) + Dgenes.append(D) + Jgenes.append(J) + Cgenes.append(C) + cdr3nts.append(cdr3nt) + cdr3aas.append(cdr3aa) + readcounts.append(readcount) + fuls.append(fl) + elif len(attrs) != 10: + Vgenes.append('NAN') + Dgenes.append('NAN') + Jgenes.append('NAN') + Cgenes.append('NAN') + cdr3nts.append('NAN') + cdr3aas.append('NAN') + readcounts.append('NAN') + fuls.append('NAN') + + res[f'TR{locus}_V'] = Vgenes + res[f'TR{locus}_D'] = Dgenes + res[f'TR{locus}_J'] = Jgenes + res[f'TR{locus}_C'] = Cgenes + res[f'TR{locus}_cdr3nt'] = cdr3nts + res[f'TR{locus}_cdr3aa'] = cdr3aas + res[f'TR{locus}_readcount'] = readcounts + res[f'TR{locus}_fl'] = fuls + + res.to_csv(f'{outdir}/new_barcode_report.tsv', sep='\t') + + return res + + +class Res_filter(Step): + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.sample = args.sample + + + @utils.add_log + def run(self): + barcode_report = f'{self.outdir}/../02.truse_assemble/TRUST4/{self.sample}_barcode_report.tsv' + res = beauty_res(self.outdir, barcode_report) + filtered = res[(res['TRB_fl']!='0')&(res['TRA_fl']!='0')] + fre = [''] * filtered.shape[0] + filtered.insert(filtered.shape[1], 'Frequent', fre) + + clones = 
filtered.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) + clones = clones.sort_values(by='Frequent', ascending=False) + + clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') + + +@utils.add_log +def res_filter(args): + step_name = 'res_filter' + res_filter_obj = Res_filter(args, step_name) + res_filter_obj.run() + + +def get_opts_res_filter(parser, sub_program): + if sub_program: + parser = s_common(parser) \ No newline at end of file diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py new file mode 100644 index 00000000..1f09ab19 --- /dev/null +++ b/celescope/trust_vdj/trust_assemble.py @@ -0,0 +1,130 @@ +import os +from celescope.tools import utils +from celescope.tools.Step import Step, s_common +from celescope.tracer_vdj.split_fastq import get_barcodes +from celescope.tools.barcode import * +import pysam +import pandas as pd + + +TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/run-trust4' + + +def count_fq(fq1): + bcs, umis, names = [], [], [] + count_df = pd.DataFrame() + with pysam.FastxFile(fq1) as fq: + for entry in fq: + attr = entry.sequence + cb = attr[:24] + umi = attr[24:] + name = entry.name + bcs.append(cb) + umis.append(umi) + names.append(name) + count_df['barcode'] = bcs + count_df['UMI'] = umis + count_df['seq_name'] = names + + return count_df + +@utils.add_log +def match_barcodes(outdir, match_dir, Seqtype, fq1): + annotated_bcs = get_barcodes(match_dir, Seqtype) + bcs_df = pd.DataFrame(annotated_bcs, columns=['barcode']) + count_df = count_fq(fq1) + df = pd.merge(bcs_df, count_df, on='barcode', how='inner') + seqnames = df['seq_name'].tolist() + seqlist = open(f'{outdir}/seqlist.txt', 'w') + for name in seqnames: + seqlist.write(str(name) + '\n') + + count_df.to_csv(f'{outdir}/count.txt', sep='\t') + df.to_csv(f'{outdir}/matched_count.txt', sep='\t') + + +class Trust_assemble(Step): + """ + Features + + - Get fq file + """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.match_dir = args.match_dir + self.Seqtype = args.Seqtype + self.fq1 = args.fq1 + self.fq2 = args.fq2 + self.sample = args.sample + self.species = args.species + + + @utils.add_log + def getFqfile(self): + match_barcodes(self.outdir, self.match_dir, self.Seqtype, self.fq1) + + cmd1 = ( + f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R1.fq' + ) + os.system(cmd1) + + cmd2 = ( + f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R2.fq' + ) + os.system(cmd2) + + + @utils.add_log + def run(self): + + self.getFqfile() + + species = self.species + + if species =='Mmus': + index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/GRCm38_bcrtcr.fa' + ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/mouse_IMGT+C.fa' + elif species == 'Hsap': + index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/hg38_bcrtcr.fa' + ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/human_IMGT+C.fa' + cmd = ( + f'{TRUST} -t {self.thread} ' + f'-u {self.outdir}/{self.sample}_R2.fq ' + f'--barcode {self.outdir}/{self.sample}_R1.fq ' + f'--barcodeRange 0 23 + ' + f'-f {index_file} ' + f'--ref {ref} ' + f'-o {self.sample} --od {self.outdir}/TRUST4' + ) + + os.system(cmd) + + os.remove(f'{self.outdir}/seqlist.txt') + + +@utils.add_log +def trust_assemble(args): + step_name = 'trust_assemble' + trust_assemble_obj = Trust_assemble(args, step_name) + trust_assemble_obj.run() + + +def get_opts_trust_assemble(parser, sub_program): + if sub_program: + 
parser = s_common(parser) + parser.add_argument('--fq1', help='R1 reads from barcode step', required=True) + parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) + parser.add_argument('--match_dir', help='match_dir', required=True) + parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + + + + + + + + -- Gitee From a43ab53fc1172a8cc25761c8c672169828e98210 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 11 Jun 2021 10:43:47 +0800 Subject: [PATCH 37/96] repo from zhouyiqi to singleron-RD --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 963f7d92..64798773 100755 --- a/setup.py +++ b/setup.py @@ -20,10 +20,10 @@ setuptools.setup( version=__VERSION__, author="zhouyiqi", author_email="zhouyiqi@singleronbio.com", - description="GEXSCOPE Single cell analysis", + description="Single Cell Analysis Pipelines", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/zhouyiqi91/CeleScope", + url="https://github.com/singleron-RD/CeleScope", packages=setuptools.find_packages(), classifiers=[ "Programming Language :: Python :: 3", -- Gitee From 32ac9d044e15b7b1f12fc1f2629fe19105faa256 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 11 Jun 2021 11:04:03 +0800 Subject: [PATCH 38/96] lint --- celescope/vdj/count_vdj.py | 70 ++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/celescope/vdj/count_vdj.py b/celescope/vdj/count_vdj.py index 04b2c32e..414a5bf4 100755 --- a/celescope/vdj/count_vdj.py +++ b/celescope/vdj/count_vdj.py @@ -43,7 +43,8 @@ class Count_vdj(Step): if (not args.match_dir) or (args.match_dir == "None"): self.match_bool = False if self.match_bool: - self.match_cell_barcodes, _match_cell_number = utils.read_barcode_file(args.match_dir) + self.match_cell_barcodes, _match_cell_number = utils.read_barcode_file( + args.match_dir) # out files self.cell_confident_file = f"{self.out_prefix}_cell_confident.tsv" @@ -59,7 +60,8 @@ class Count_vdj(Step): df_UMI_sum = df_UMI_count_filter.groupby( ['barcode'], as_index=False).agg({"UMI": "sum"}) if (self.args.UMI_min == "auto"): - df_UMI_sum_sorted = df_UMI_sum.sort_values(["UMI"], ascending=False) + df_UMI_sum_sorted = df_UMI_sum.sort_values( + ["UMI"], ascending=False) rank_UMI = df_UMI_sum_sorted.iloc[CELL_CALLING_RANK, :]["UMI"] UMI_min = int(rank_UMI / 10) else: @@ -71,7 +73,7 @@ class Count_vdj(Step): df = df_UMI_sum.sort_values('UMI', ascending=False) self.add_data_item(CB_num=df[df['mark'] == 'CB'].shape[0]) self.add_data_item(Cells=list(df.loc[df['mark'] == 'CB', 'UMI'])) - self.add_data_item(UB_num= df[df['mark'] == 'UB'].shape[0]) + self.add_data_item(UB_num=df[df['mark'] == 'UB'].shape[0]) self.add_data_item(Background=list(df.loc[df['mark'] == 'UB', 'UMI'])) cell_barcodes = set(df_UMI_cell.barcode) @@ -81,7 +83,8 @@ class Count_vdj(Step): value=total_cell_number, ) - df_cell = df_UMI_count_filter[df_UMI_count_filter.barcode.isin(cell_barcodes)] + df_cell = df_UMI_count_filter[df_UMI_count_filter.barcode.isin( + cell_barcodes)] return df_cell, cell_barcodes @utils.add_log @@ -97,7 +100,7 @@ class Count_vdj(Step): ["barcode", "chain"], as_index=False).head(1) return df_confident - def get_df_valid_count(self,df_confident): + def get_df_valid_count(self, df_confident): df_valid_count = df_confident.set_index(["barcode", 
"chain"]) df_valid_count = df_valid_count.unstack() df_valid_count.columns = ['_'.join(col) for col in df_valid_count] @@ -105,14 +108,13 @@ class Count_vdj(Step): df_valid_count.fillna(inplace=True, value="NA") return df_valid_count - def get_clonetypes_and_write(self, df_valid_count, cell_barcodes): """ Returns - df_clonetypes - df_match_clonetypes """ - + total_cell_number = len(cell_barcodes) df_clonetypes = df_valid_count.copy() df_match_clonetypes = None @@ -121,7 +123,8 @@ class Count_vdj(Step): "barcode": "count"}) # put na last df_clonetypes.replace('NA', np.nan, inplace=True) - df_clonetypes.sort_values(["barcode"] + self.cols, ascending=False, na_position='last', inplace=True) + df_clonetypes.sort_values( + ["barcode"] + self.cols, ascending=False, na_position='last', inplace=True) df_clonetypes.replace(np.nan, 'NA', inplace=True) total_CDR3_barcode_number = sum(df_clonetypes.barcode) @@ -138,7 +141,8 @@ class Count_vdj(Step): # order order = ["clonetype_ID"] + self.cols + ["barcode", "percent"] df_clonetypes = df_clonetypes[order] - df_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True) + df_clonetypes.rename( + columns={"barcode": "barcode_count"}, inplace=True) # out clonetypes df_clonetypes.to_csv(self.clonetypes_file, sep="\t", index=False) @@ -194,11 +198,11 @@ class Count_vdj(Step): total=total_cell_number ) - # BCR elif self.args.type == "BCR": - UMI_col_dic = {"IGH": "UMI_IGH", "IGL": "UMI_IGL", "IGK": "UMI_IGK"} + UMI_col_dic = {"IGH": "UMI_IGH", + "IGL": "UMI_IGL", "IGK": "UMI_IGK"} for chain in UMI_col_dic: UMI_col_name = UMI_col_dic[chain] if UMI_col_name in df_valid_count.columns: @@ -271,24 +275,26 @@ class Count_vdj(Step): df_match_clonetypes["percent"] = df_match_clonetypes["percent"].apply( lambda x: round(x, 2) ) - df_match_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True) + df_match_clonetypes.rename( + columns={"barcode": "barcode_count"}, inplace=True) df_match_clonetypes = df_match_clonetypes.merge( df_clonetypes, on=self.cols, how='left', suffixes=('', '_y')) # order and drop duplicated cols order = ["clonetype_ID"] + self.cols + ["barcode_count", "percent"] df_match_clonetypes = df_match_clonetypes[order] - df_match_clonetypes.sort_values(["barcode_count", "clonetype_ID"], ascending=[False,True], inplace=True) + df_match_clonetypes.sort_values(["barcode_count", "clonetype_ID"], ascending=[ + False, True], inplace=True) df_match_clonetypes.to_csv( self.match_clonetypes_file, sep="\t", index=False) return df_clonetypes, df_match_clonetypes - def write_cell_confident_count(self, df_valid_count, df_clonetypes, df_confident): df_mergeID = pd.merge(df_valid_count, - df_clonetypes, how="left", on=self.cols) + df_clonetypes, how="left", on=self.cols) df_mergeID.sort_values(["clonetype_ID", "barcode"], inplace=True) # output df_valid_count - df_mergeID.to_csv(self.cell_confident_count_file, sep="\t", index=False) + df_mergeID.to_csv(self.cell_confident_count_file, + sep="\t", index=False) df_mergeID = df_mergeID[["barcode", "clonetype_ID"]] df_cell_confident_with_ID = pd.merge( df_confident, df_mergeID, how="left", on="barcode") @@ -298,9 +304,8 @@ class Count_vdj(Step): df_cell_confident_with_ID.to_csv( self.cell_confident_file, sep="\t", index=False) - def write_clonetypes_table_to_data(self, df_clonetypes, df_match_clonetypes): - # cloneytpes table + # cloneytpes table def format_table(df_clonetypes): df_table = df_clonetypes.copy() df_table["percent"] = df_table["percent"].apply( @@ -310,7 +315,8 @@ class Count_vdj(Step): 
for chain in self.chains: for seq in seqs: cols.append("_".join([seq, chain])) - df_table_cols = ["clonetype_ID"] + cols + ["barcode_count", "percent"] + df_table_cols = ["clonetype_ID"] + \ + cols + ["barcode_count", "percent"] df_table = df_table[df_table_cols] table_header = ["Clonetype_ID"] + cols + ["Frequency", "Percent"] return df_table, table_header @@ -325,18 +331,21 @@ class Count_vdj(Step): self.add_data_item(table_dict=table_dict) def run(self): - df_UMI_count_filter = pd.read_csv(self.args.UMI_count_filter_file, sep='\t') + df_UMI_count_filter = pd.read_csv( + self.args.UMI_count_filter_file, sep='\t') df_cell, cell_barcodes = self.cell_calling(df_UMI_count_filter) df_confident = self.get_df_confident(df_cell) df_valid_count = self.get_df_valid_count(df_confident) - df_clonetypes, df_match_clonetypes = self.get_clonetypes_and_write(df_valid_count, cell_barcodes) - self.write_cell_confident_count(df_valid_count, df_clonetypes, df_confident) + df_clonetypes, df_match_clonetypes = self.get_clonetypes_and_write( + df_valid_count, cell_barcodes) + self.write_cell_confident_count( + df_valid_count, df_clonetypes, df_confident) self.write_clonetypes_table_to_data(df_clonetypes, df_match_clonetypes) self.clean_up() def count_vdj(args): - # TODO + # TODO # add TCR or BCR prefix to distinguish them in html report summary; should improve step_name = f"{args.type}_count_vdj" count_vdj_obj = Count_vdj(args, step_name) @@ -344,25 +353,26 @@ def count_vdj(args): def get_opts_count_vdj(parser, sub_program): - parser.add_argument("--type", help="Required. `TCR` or `BCR`. ", required=True) + parser.add_argument( + "--type", help="Required. `TCR` or `BCR`. ", required=True) parser.add_argument( '--UMI_min', - help='Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell.', + help='Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell.', default="auto" ) parser.add_argument( - '--iUMI', + '--iUMI', help="""Default `1`. Minimum number of UMI of identical receptor type and CDR3. -For each (barcode, chain) combination, only UMI>=iUMI is considered valid.""", +For each (barcode, chain) combination, only UMI>=iUMI is considered valid.""", type=int, default=1 ) if sub_program: - parser.add_argument("--UMI_count_filter_file", help="Required. File from step mapping_vdj.", required=True) + parser.add_argument("--UMI_count_filter_file", + help="Required. File from step mapping_vdj.", required=True) parser.add_argument( - "--match_dir", + "--match_dir", help="Match celescope scRNA-Seq directory. 
", default=None ) parser = s_common(parser) - -- Gitee From 7ee397d4cb6e0e163dba38f4a6f6b5b572e5b84f Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 11 Jun 2021 14:38:46 +0800 Subject: [PATCH 39/96] rm redundant codes --- celescope/rna/star.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/celescope/rna/star.py b/celescope/rna/star.py index 2af181e2..3e9e1f4d 100755 --- a/celescope/rna/star.py +++ b/celescope/rna/star.py @@ -44,9 +44,9 @@ class Star_rna(Step, StarMixin): # parse self.refflat = f"{self.genomeDir}/{self.genome['refflat']}" - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' - self.picard_region_log = f'{self.outdir}/{self.sample}_region.log' + self.ribo_log = f'{self.out_prefix}_ribo_log.txt' + self.ribo_run_log = f'{self.out_prefix}_ribo_run.log' + self.picard_region_log = f'{self.out_prefix}_region.log' self.plot = None self.stats = pd.Series() @@ -113,9 +113,8 @@ class Star_rna(Step, StarMixin): @utils.add_log def ribo(self): + # TODO remove bbduk.sh and use picard ribo bases human_ribo_fa = f'{ROOT_PATH}/data/rRNA/human_ribo.fasta' - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' cmd = ( f'bbduk.sh ' f'in1={self.fq} ' -- Gitee From 84224f8510bb45f7606660358026d7d1aaff5343 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 11 Jun 2021 19:41:52 +0800 Subject: [PATCH 40/96] fix bug if one chain miss in all cells --- .../html/tracer_vdj/go_assemble_summary.html | 14 +- .../html/tracer_vdj/vdj_sum_summary.html | 14 +- celescope/tracer_vdj/go_assemble.py | 8 +- celescope/tracer_vdj/vdj_sum.py | 376 ++++++++---------- celescope/trust_vdj/res_filter.py | 2 +- 5 files changed, 179 insertions(+), 235 deletions(-) diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html index 5bb8c0bf..be42468a 100644 --- a/celescope/templates/html/tracer_vdj/go_assemble_summary.html +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -3,14 +3,14 @@
{% for item in go_assemble_summary %} diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html index 7b5fdb08..fc2e6d7c 100644 --- a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -9,18 +9,18 @@

Cells with IGL: Cells with full length IGL.
Cells with paired IGH and IGK: Cells with paired IGH and IGK.
Cells with paired IGH and IGL: Cells with paired IGH and IGL.
-Median UMIs per cell: Median total UMIs per cell.
-Median IGH UMIs per cell: Median UMIs mapped to IGH.
-Median IGK UMIs per cell: Median UMIs mapped to IGK.
-Median IGL UMIs per cell: Median UMIs mapped to IGL.
+Median read count per cell: Median total read count per cell.
+Median IGH read count per cell: Median read count mapped to IGH.
+Median IGK read count per cell: Median read count mapped to IGK.
+Median IGL read count per cell: Median read count mapped to IGL.
If type is TCR:
Estimated Number of Cells: Number of cells which contain full length TRA or TRB.
Cells with TRA: Cells with full length TRA.
Cells with TRB: Cells with full length TRB.
Cells with paired TRA and TRB: Cells with paired TRA and TRB.
-Median UMIs per cell: Median UMIs mapped to TRA and TRB.
-Median TRA UMIs per cell: Median UMIs mapped to TRA.
-Median TRB UMIs per cell: Median UMIs mapped to TRB.
+Median read count per cell: Median read count mapped to TRA and TRB.
+Median TRA read count per cell: Median read count mapped to TRA.
+Median TRB read count per cell: Median read count mapped to TRB.
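The medians above are computed in vdj_sum.py from TraCeR/BraCeR's per-cell aligned-read fastq files (under 04.go_assemble, per the comment in tpm_count). A minimal sketch of both metrics (read count per cell and distinct-UMI count per cell), assuming the {barcode}_{UMI}_{index} read-name convention written by the barcode step; the helper name below is hypothetical:

import glob
import numpy as np
import pysam

def count_reads_and_umis(fq):
    # Read names follow {barcode}_{UMI}_{index}, so the UMI is the
    # second underscore-separated field of each record name.
    n_reads, umis = 0, set()
    with pysam.FastxFile(fq) as fh:
        for entry in fh:
            n_reads += 1
            umis.add(entry.name.split('_')[1])
    return n_reads, len(umis)

# Median per-cell TRB metrics, using the same glob pattern as vdj_sum.py.
fqs = glob.glob('04.go_assemble/tracer/*/aligned_reads/*_TCR_B.fastq')
pairs = [count_reads_and_umis(fq) for fq in fqs]
median_reads = int(np.median([p[0] for p in pairs])) if pairs else 0
median_umis = int(np.median([p[1] for p in pairs])) if pairs else 0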

{% for item in vdj_sum_summary %} diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 452eb54d..3a291d39 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -227,9 +227,13 @@ class Go_assemble(Step): @utils.add_log def run(self): if self.Seqtype == 'TCR': - self.run_tracer() + tracer_dir = f'{self.outdir}/tracer/filtered_TCRAB_summary/recombinants.txt' + if not os.path.exists(tracer_dir): + self.run_tracer() elif self.Seqtype == 'BCR': - self.run_bracer() + bracer_dir = f'{self.outdir}/bracer/filtered_BCR_summary/changeodb.tab' + if not os.path.exists(bracer_dir): + self.run_bracer() self.clean_up() diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index d3896db0..73772bc4 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -1,4 +1,5 @@ import os +from re import I import pandas as pd from Bio.Seq import Seq import numpy as np @@ -9,17 +10,16 @@ import glob import pysam -def get_umi_count(fq): - umis = [] +def get_read_count(fq): + count = 0 with pysam.FastxFile(fq) as fh: for entry in fh: - attr = entry.name.split('_') - umi = attr[1] - umis.append(umi) - res = len(set(umis)) - return res + count += 1 + return count + +@utils.add_log def tpm_count(ass_dir): rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble @@ -41,54 +41,79 @@ def tpm_count(ass_dir): return productive +@utils.add_log def filtering(Seqtype, ass_dir, outdir): if not os.path.exists(outdir): os.makedirs(outdir) if Seqtype == 'TCR': data = tpm_count(ass_dir) - cell_name = set(list(data['cell_name'])) + cell_name = list(set(list(data['cell_name']))).sort() filtered = pd.DataFrame() - for name in cell_name: - count_data = data[data['cell_name'] == name] - tra = count_data[count_data['locus'] == 'A'] - trb = count_data[count_data['locus'] == 'B'] - if tra.empty is not True: - tra = tra.sort_values(by='TPM', ascending=False) - tra = tra.head(1) - filtered = filtered.append(tra, ignore_index=True) - if trb.empty is not True: - trb = trb.sort_values(by='TPM', ascending=False) - trb = trb.head(1) - filtered = filtered.append(trb, ignore_index=True) + df = pd.DataFrame(cell_name, columns=['cell_name']) + loci = ['A', 'B'] + for locus in loci: + tmp = data[data['locus']==locus] + tmp = tmp.sort_values(by='TPM', ascending=False) + tmp = tmp.drop_duplicates('cell_name', 'first') + filtered = filtered.append(tmp, ignore_index=True) + + tmp = tmp.rename(columns={'CDR3aa': f'TR{locus}_CDR3aa'}) + clones = tmp[['cell_name', f'TR{locus}_CDR3aa']] + df = pd.merge(df, clones, on='cell_name', how='outer') + + df = df.fillna('None') + clonetypes = df.groupby(['TRA_CDR3aa', 'TRB_CDR3aa']).agg({'cell_name': 'count'}) + clonetypes = clonetypes.sort_values(by='cell_name', ascending=False) + clonetypes = clonetypes.rename(columns={'cell_name': 'Frequency'}) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') elif Seqtype == 'BCR': data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') - data = data[data['FUNCTIONAL'] == True] - cell_name = set(list(data['CELL'])) + data = data[(data['FUNCTIONAL'] == True) & (data['IN_FRAME'] == True)] + cell_name = list(set(data['CELL'].tolist())).sort() filtered = pd.DataFrame() - for name in cell_name: - count_cell = data[data['CELL'] == name] - count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) - 
count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) - count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) - count_k_l = count_k.append(count_l) - if count_h.empty is not True: - count_h = count_h.sort_values(by='TPM', ascending=False) - count_h = count_h.head(1) - filtered = filtered.append(count_h, ignore_index=True) - if count_k_l.empty is not True: - count_k_l = count_k_l.sort_values(by='TPM', ascending=False) - count_k_l = count_k_l.head(1) - filtered = filtered.append(count_k_l, ignore_index=True) + tmp = data[data['LOCUS'] == 'H'] + tmp = tmp.sort_values(by='TPM', ascending=False) + tmp = tmp.drop_duplicates('CELL', 'first') + filtered = filtered.append(tmp, ignore_index=True) + + tmp2 = data[data['LOCUS'] != 'H'] + tmp2 = tmp2.sort_values(by='TPM', ascending=False) + tmp2 = tmp2.drop_duplicates('CELL', 'first') + filtered = filtered.append(tmp2, ignore_index=True) + + df = pd.DataFrame(cell_name, columns=['CELL']) + + loci = ['H', 'L', 'K'] + for locus in loci: + tmp = filtered[filtered['LOCUS'] == locus][['CELL', 'JUNCTION']] + tmp.columns = ['CELL', f'JUNCTION_{locus}'] + ntseqs = tmp[f'JUNCTION_{locus}'].tolist() + tmplist = [] + for nt in ntseqs: + nt = Seq(nt) + nt = nt.reverse_complement() + tmplist.append(str(nt)) + tmp.insert(tmp.shape[1], f'IG{locus}_CDR3aa', tmplist) + + df = pd.merge(df, tmp, on='CELL', how='outer') + + df = df.fillna('None') + + clonetypes = df.groupby(['IGH_CDR3aa', 'IGL_CDR3aa', 'IGK_CDR3aa']).agg({'CELL': 'count'}) + clonetypes = clonetypes.sort_values(by='CELL', ascending=False) + clonetypes = clonetypes.rename(columns={'CELL': 'Frequency'}) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + filtered = filtered.rename(columns={'CELL': 'cell_name'}) filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - return filtered - class Vdj_sum(Step): """ @@ -116,62 +141,32 @@ class Vdj_sum(Step): fastq_dir = self.fastq_dir Seqtype = self.Seqtype - results = filtering(Seqtype, ass_dir, outdir) + filtering(Seqtype, ass_dir, outdir) + + filter_data = pd.read_csv(f'{outdir}/filtered.txt', sep='\t') stat_file = outdir + '/stat.txt' vdj_sum_summary = [] count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' - count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) - median_all = int(count_umi['UMI'].median()) - if Seqtype == 'TCR': + clonetypes = pd.read_csv(f'{outdir}/clonetypes.tsv', sep='\t') - productive_cells = set(results['cell_name'].tolist()) + productive_cells = set(filter_data['cell_name'].tolist()) + productive_cells_num = len(productive_cells) + if Seqtype == 'TCR': + # barcode umi plot count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") count_umi.to_csv(count_umi_file, sep='\t') self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - productive_cells_num = len(productive_cells) - - TRA_chain = results[results['locus'] == 'A'] - TRA_chain_num = TRA_chain.shape[0] - TRB_chain = results[results['locus'] == 'B'] - TRB_chain_num = TRB_chain.shape[0] - - TRAs, TRBs = [], [] - paired_cell = 0 - for cell in productive_cells: - tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] - if tmp1.empty is not True: - chainA = tmp1['CDR3aa'].tolist()[0] - TRAs.append(chainA) - else: - TRAs.append('NaN') - - tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] - if tmp2.empty is not True: - chainB = tmp2['CDR3aa'].tolist()[0] - TRBs.append(chainB) - else: - TRBs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_cell += 1 - - clonetypes_table = 
pd.DataFrame() - clonetypes_table['TRA_chain'] = TRAs - clonetypes_table['TRB_chain'] = TRBs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - + # clonetype table sum_c = clonetypes['Frequency'].sum() proportions = [] for f in list(clonetypes['Frequency']): @@ -184,10 +179,13 @@ class Vdj_sum(Step): clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes['TRA_CDR3aa'] = clonetypes.TRA_CDR3aa.apply(lambda x: 'C'+str(x)+'F' if x != 'None' else 'None') + clonetypes['TRB_CDR3aa'] = clonetypes.TRB_CDR3aa.apply(lambda x: 'C'+str(x)+'F' if x != 'None' else 'None') - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + clonetypes = clonetypes.reindex(columns=list(['CloneId', 'TRA_CDR3aa', 'TRB_CDR3aa', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t', index=None) vdj_sum_summary.append({ 'item': 'Estimated Number of Cells', @@ -195,17 +193,18 @@ class Vdj_sum(Step): 'total_count': np.nan, }) - vdj_sum_summary.append({ - 'item': 'Cells with TRA', - 'count': TRA_chain_num, - 'total_count': productive_cells_num, - }) + loci = ['A', 'B'] - vdj_sum_summary.append({ - 'item': 'Cells with TRB', - 'count': TRB_chain_num, - 'total_count': productive_cells_num, - }) + for locus in loci: + tmp = int(clonetypes[clonetypes[f'TR{locus}_CDR3aa'] != 'None']['Frequency'].sum()) + + vdj_sum_summary.append({ + 'item': f'Cells with TR{locus}', + 'count': tmp, + 'total_count': productive_cells_num, + }) + + paired_cell = int(clonetypes[(clonetypes['TRA_CDR3aa'] != 'None') & (clonetypes['TRB_CDR3aa'] != 'None')]['Frequency'].sum()) vdj_sum_summary.append({ 'item': 'Cells with paired TRA and TRB', @@ -213,97 +212,47 @@ class Vdj_sum(Step): 'total_count': productive_cells_num, }) - TRAs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - - medianA = int(np.median(TRA_UMIs)) - medianB = int(np.median(TRB_UMIs)) - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', + 'item': 'Median read count per cell', 'count': median_all, 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medianA, - 'total_count': np.nan - }) + }) - vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medianB, - 'total_count': np.nan - }) + for locus in loci: + tmp = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_{locus}.fastq') + if len(tmp) != 0: + read_count = [get_read_count(fq) for fq in tmp] + read_count.sort() + for i in range(len(read_count)): + if read_count[i] != 0: + idx = i + break + read_count = read_count[idx:] + median_tmp = int(np.median(read_count)) + vdj_sum_summary.append({ + 'item': f'Median TR{locus} read count per cell', + 'count': median_tmp, + 'total_count': np.nan + }) + else: + vdj_sum_summary.append({ + 'item': f'Median TR{locus} read count per cell', + 'count': 0, + 'total_count': np.nan + }) elif Seqtype == 'BCR': - productive_cells = set(results['CELL'].tolist()) - - productive_cells_num = 
len(productive_cells) - + # barcode umi plot count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") count_umi.to_csv(count_umi_file, sep='\t') self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - results_h = results[results['LOCUS'] == 'H'] - results_k = results[results['LOCUS'] == 'K'] - results_l = results[results['LOCUS'] == 'L'] - results_h_count = results_h.shape[0] - results_k_count = results_k.shape[0] - results_l_count = results_l.shape[0] - - IGHs, IGKs, IGLs = [], [], [] - - paired_k, paired_l = 0, 0 - - for cell in productive_cells: - tmp1 = results_h[results_h['CELL'] == cell] - if tmp1.empty is not True: - seq = tmp1['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGHs.append(aaseq) - else: - IGHs.append('NaN') - - tmp2 = results_l[results_l['CELL'] == cell] - if tmp2.empty is not True: - seq = tmp2['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGLs.append(aaseq) - else: - IGLs.append('NaN') - - tmp3 = results_k[results_k['CELL'] == cell] - if tmp3.empty is not True: - seq = tmp3['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGKs.append(aaseq) - else: - IGKs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_l += 1 - if not tmp1.empty and not tmp3.empty: - paired_k += 1 - - clonetypes_table = pd.DataFrame() - - clonetypes_table['IGH_chain'] = IGHs - clonetypes_table['IGL_chain'] = IGLs - clonetypes_table['IGK_chain'] = IGKs - clonetypes_table['Frequency'] = '' - clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + # clone type table Proportion = [] sum_c = clonetypes['Frequency'].sum() @@ -317,9 +266,10 @@ class Vdj_sum(Step): clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['CloneId', 'IGH_CDR3aa', 'IGL_CDR3aa', 'IGK_CDR3aa', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t', index=None) vdj_sum_summary.append({ @@ -328,78 +278,68 @@ class Vdj_sum(Step): 'total_count': np.nan }) - vdj_sum_summary.append({ - 'item': 'Cells with IGH', - 'count': results_h_count, - 'total_count': productive_cells_num - }) + loci = ['H', 'L', 'K'] - vdj_sum_summary.append({ - 'item': 'Cells with IGK', - 'count': results_k_count, - 'total_count': productive_cells_num - }) + for locus in loci: + tmp = int(clonetypes[clonetypes[f'IG{locus}_CDR3aa']!='None']['Frequency'].sum()) - vdj_sum_summary.append({ - 'item': 'Cells with IGL', - 'count': results_l_count, - 'total_count': productive_cells_num - }) + vdj_sum_summary.append({ + 'item': f'Cells with IG{locus}', + 'count': tmp, + 'total_count': productive_cells_num + }) - vdj_sum_summary.append({ - 'item': 'Cells with paired IGH and IGK', - 'count': paired_k, - 'total_count': productive_cells_num - }) + paired_H_L = int(clonetypes[(clonetypes['IGH_CDR3aa']!='None') & (clonetypes['IGL_CDR3aa']!='None')]['Frequency'].sum()) vdj_sum_summary.append({ 'item': 'Cells with paired IGH and IGL', - 'count': paired_l, + 'count': paired_H_L, 'total_count': productive_cells_num }) - 
IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs = [get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - medianH = int(np.median(IGH_UMIs)) - medianL = int(np.median(IGL_UMIs)) - medianK = int(np.median(IGK_UMIs)) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': median_all, - 'total_count': np.nan - }) + paired_H_K = int(clonetypes[(clonetypes['IGH_CDR3aa']!='None') & (clonetypes['IGK_CDR3aa']!='None')]['Frequency'].sum()) vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medianH, - 'total_count': np.nan + 'item': 'Cells with paired IGH and IGK', + 'count': paired_H_K, + 'total_count': productive_cells_num }) vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medianL, + 'item': 'Median read count per cell', + 'count': median_all, 'total_count': np.nan }) - vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medianK, - 'total_count': np.nan - }) + for locus in loci: + tmp = glob.glob(f'{ass_dir}/bracer/*/aligned_reads/*_BCR_{locus}.fastq') + if len(tmp) != 0: + read_count = [get_read_count(fq) for fq in tmp] + read_count.sort() + for i in range(len(read_count)): + if read_count[i] != 0: + idx = i + break + read_count = read_count[idx:] + median_tmp = int(np.median(read_count)) + vdj_sum_summary.append({ + 'item': f'Median IG{locus} read count per cell', + 'count': median_tmp, + 'total_count': np.nan + }) + else: + vdj_sum_summary.append({ + 'item': f'Median IG{locus} read count per cell', + 'count': 0, + 'total_count': np.nan + }) df = pd.DataFrame(vdj_sum_summary, columns=['item', 'count', 'total_count']) utils.gen_stat(df, stat_file) - # clonetype table + # clonetype table title = 'Clonetypes' table_dict = self.get_table(title, 'clonetypes_table', clonetypes) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index ce46c374..ce6b1e2d 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -63,7 +63,7 @@ class Res_filter(Step): @utils.add_log def run(self): - barcode_report = f'{self.outdir}/../02.truse_assemble/TRUST4/{self.sample}_barcode_report.tsv' + barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' res = beauty_res(self.outdir, barcode_report) filtered = res[(res['TRB_fl']!='0')&(res['TRA_fl']!='0')] fre = [''] * filtered.shape[0] -- Gitee From f494a4b8cad5f22924fc7cc023aacd1bbc302b43 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 15 Jun 2021 14:10:42 +0800 Subject: [PATCH 41/96] change read count to umi count --- celescope/tracer_vdj/vdj_sum.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 73772bc4..01a25116 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -10,12 +10,17 @@ import glob import pysam -def get_read_count(fq): - count = 0 + +def get_umi_count(fq): + umis = [] with pysam.FastxFile(fq) as fh: for entry in fh: - count += 1 - + name = entry.name + name = name.split('_') + umi = name[1] + umis.append(umi) + count = len(set(umis)) + return count @@ -48,7 +53,7 @@ def filtering(Seqtype, ass_dir, outdir): if Seqtype == 'TCR': data = tpm_count(ass_dir) - cell_name = 
list(set(list(data['cell_name']))).sort() + cell_name = sorted(list(set(list(data['cell_name'])))) filtered = pd.DataFrame() df = pd.DataFrame(cell_name, columns=['cell_name']) loci = ['A', 'B'] @@ -75,7 +80,7 @@ def filtering(Seqtype, ass_dir, outdir): data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') data = data[(data['FUNCTIONAL'] == True) & (data['IN_FRAME'] == True)] - cell_name = list(set(data['CELL'].tolist())).sort() + cell_name = sorted(list(set(data['CELL'].tolist()))) filtered = pd.DataFrame() tmp = data[data['LOCUS'] == 'H'] @@ -213,7 +218,7 @@ class Vdj_sum(Step): }) vdj_sum_summary.append({ - 'item': 'Median read count per cell', + 'item': 'Median UMIs per cell', 'count': median_all, 'total_count': np.nan }) @@ -221,7 +226,7 @@ class Vdj_sum(Step): for locus in loci: tmp = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_{locus}.fastq') if len(tmp) != 0: - read_count = [get_read_count(fq) for fq in tmp] + read_count = [get_umi_count(fq) for fq in tmp] read_count.sort() for i in range(len(read_count)): if read_count[i] != 0: @@ -230,13 +235,13 @@ class Vdj_sum(Step): read_count = read_count[idx:] median_tmp = int(np.median(read_count)) vdj_sum_summary.append({ - 'item': f'Median TR{locus} read count per cell', + 'item': f'Median TR{locus} UMIs per cell', 'count': median_tmp, 'total_count': np.nan }) else: vdj_sum_summary.append({ - 'item': f'Median TR{locus} read count per cell', + 'item': f'Median TR{locus} UMIs per cell', 'count': 0, 'total_count': np.nan }) @@ -306,7 +311,7 @@ class Vdj_sum(Step): }) vdj_sum_summary.append({ - 'item': 'Median read count per cell', + 'item': 'Median UMIs per cell', 'count': median_all, 'total_count': np.nan }) @@ -314,7 +319,7 @@ class Vdj_sum(Step): for locus in loci: tmp = glob.glob(f'{ass_dir}/bracer/*/aligned_reads/*_BCR_{locus}.fastq') if len(tmp) != 0: - read_count = [get_read_count(fq) for fq in tmp] + read_count = [get_umi_count(fq) for fq in tmp] read_count.sort() for i in range(len(read_count)): if read_count[i] != 0: @@ -323,13 +328,13 @@ class Vdj_sum(Step): read_count = read_count[idx:] median_tmp = int(np.median(read_count)) vdj_sum_summary.append({ - 'item': f'Median IG{locus} read count per cell', + 'item': f'Median IG{locus} UMIs per cell', 'count': median_tmp, 'total_count': np.nan }) else: vdj_sum_summary.append({ - 'item': f'Median IG{locus} read count per cell', + 'item': f'Median IG{locus} UMIs per cell', 'count': 0, 'total_count': np.nan }) -- Gitee From 6ffd3fe560e227996688e91ac6b21ab0fcbd8928 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 15 Jun 2021 14:10:57 +0800 Subject: [PATCH 42/96] rm paired fq --- celescope/tools/barcode.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index 1c48f6e3..143ce6d1 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -225,17 +225,12 @@ class Barcode(Step): self.lowNum = args.lowNum self.lowQual = args.lowQual self.allowNoPolyT = args.allowNoPolyT - self.allowNoLinker = args.allowNoLinker - self.paired_fq = args.paired_fq - self.new_f1 = f'{self.outdir}/{self.sample}_new_R1.fq{suffix}' - self.new_f2 = f'{self.outdir}/{self.sample}_new_R2.fq{suffix}' + self.allowNoLinker = args.allowNoLinker @utils.add_log def run(self): fh3 = xopen(self.out_fq2, 'w') - new_f1 = xopen(self.new_f1, 'w') - new_f2 = xopen(self.new_f2, 'w') if self.nopolyT: fh1_without_polyT = xopen(self.outdir + '/noPolyT_1.fq', 'w') @@ -383,11 +378,6 @@ class 
Barcode(Step): fh3.write(f'@{cb}_{umi}_{self.total_num}\n{seq2}\n+\n{qual2}\n') - if self.paired_fq: - - new_f1.write(f'@{header1}\n{cb}{umi}\n+\n{C_U_quals_ascii}\n') - new_f2.write(f'@{header2}\n{seq2}\n+\n{qual2}\n') - Barcode.run.logger.info(self.fq1_list[i] + ' finished.') fh3.close() @@ -498,7 +488,6 @@ def get_opts_barcode(parser, sub_program=True): parser.add_argument('--gzip', help="output gzipped fastq", action='store_true') parser.add_argument( '--chemistry', choices=__PATTERN_DICT__.keys(), help='chemistry version', default='auto') - parser.add_argument('--paired_fq', help="output R1 R2", action='store_true') if sub_program: parser.add_argument('--fq1', help='read1 fq file', required=True) parser.add_argument('--fq2', help='read2 fq file', required=True) -- Gitee From 0c83c8bd0c7d567f1b062744547b48e7cb9804c5 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 15 Jun 2021 14:11:12 +0800 Subject: [PATCH 43/96] add convert step --- celescope/trust_vdj/__init__.py | 2 +- celescope/trust_vdj/convert.py | 336 +++++++++++++++++++++++++ celescope/trust_vdj/multi_trust_vdj.py | 15 +- 3 files changed, 350 insertions(+), 3 deletions(-) create mode 100644 celescope/trust_vdj/convert.py diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py index 69aeb6f7..f95b64a2 100644 --- a/celescope/trust_vdj/__init__.py +++ b/celescope/trust_vdj/__init__.py @@ -1,6 +1,6 @@ __STEPS__ = [ 'sample', - 'barcode', + 'convert', 'trust_assemble', 'res_filter'] __ASSAY__ = 'trust_vdj' diff --git a/celescope/trust_vdj/convert.py b/celescope/trust_vdj/convert.py new file mode 100644 index 00000000..4039a215 --- /dev/null +++ b/celescope/trust_vdj/convert.py @@ -0,0 +1,336 @@ +"""barcode step.""" + +import os +import re +import subprocess +import sys +import glob +from collections import defaultdict, Counter +from itertools import combinations, product + +import pandas as pd +import pysam +from xopen import xopen + +import celescope.tools.utils as utils +from celescope.tools.__init__ import __PATTERN_DICT__ +from celescope.tools.Chemistry import Chemistry +from celescope.tools.barcode import * +from celescope.tools.Step import Step, s_common + + +class Convert(Step): + + '''convert step class + ''' + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.fq1_list = args.fq1.split(",") + self.fq2_list = args.fq2.split(",") + self.fq_number = len(self.fq1_list) + if self.fq_number != len(self.fq2_list): + raise Exception('fastq1 and fastq2 do not have same file number!') + if args.chemistry == 'auto': + ch = Chemistry(args.fq1) + self.chemistry_list = ch.check_chemistry() + else: + self.chemistry_list = [args.chemistry] * self.fq_number + self.barcode_corrected_num = 0 + self.linker_corrected_num = 0 + self.total_num = 0 + self.clean_num = 0 + self.no_polyT_num = 0 + self.lowQual_num = 0 + self.no_linker_num = 0 + self.no_barcode_num = 0 + self.barcode_qual_Counter = Counter() + self.umi_qual_Counter = Counter() + if args.gzip: + suffix = ".gz" + else: + suffix = "" + self.out_fq1 = f'{self.outdir}/{self.sample}_1.fq{suffix}' + self.out_fq2 = f'{self.outdir}/{self.sample}_2.fq{suffix}' + self.nopolyT = args.nopolyT + self.noLinker = args.noLinker + self.bool_probe = False + if args.probe_file and args.probe_file != 'None': + self.bool_probe = True + self.probe_count_dic = utils.genDict(dim=3) + self.valid_count_dic = utils.genDict(dim=2) + self.probe_dic = utils.read_fasta(args.probe_file) + self.reads_without_probe = 0 + self.pattern = args.pattern + 
self.linker = args.linker + self.whitelist = args.whitelist + self.lowNum = args.lowNum + self.lowQual = args.lowQual + self.allowNoPolyT = args.allowNoPolyT + self.allowNoLinker = args.allowNoLinker + + @utils.add_log + def run(self): + + outfq1 = xopen(self.out_fq1, 'w') + outfq2 = xopen(self.out_fq2, 'w') + + if self.nopolyT: + fh1_without_polyT = xopen(self.outdir + '/noPolyT_1.fq', 'w') + fh2_without_polyT = xopen(self.outdir + '/noPolyT_2.fq', 'w') + + if self.noLinker: + fh1_without_linker = xopen(self.outdir + '/noLinker_1.fq', 'w') + fh2_without_linker = xopen(self.outdir + '/noLinker_2.fq', 'w') + + for i in range(self.fq_number): + + chemistry = self.chemistry_list[i] + lowNum = int(self.lowNum) + Convert.run.logger.info(f'lowQual score: {self.lowQual}') + lowQual = int(self.lowQual) + if chemistry == 'scopeV1': + lowNum = min(0, lowNum) + lowQual = max(10, lowQual) + Convert.run.logger.info(f'scopeV1: lowNum={lowNum}, lowQual={lowQual} ') + # get linker and whitelist + bc_pattern = __PATTERN_DICT__[chemistry] + if (bc_pattern): + (linker, whitelist) = get_scope_bc(chemistry) + else: + bc_pattern = self.pattern + linker = self.linker + whitelist = self.whitelist + if not bc_pattern: + raise Exception("invalid bc_pattern!") + + # parse pattern to dict, C8L10C8L10C8U8 + # defaultdict(, {'C': [[0, 8], [18, 26], [36, 44]], 'U': + # [[44, 52]], 'L': [[8, 18], [26, 36]]}) + pattern_dict = parse_pattern(bc_pattern) + + bool_T = True if 'T' in pattern_dict else False + bool_L = True if 'L' in pattern_dict else False + bool_whitelist = (whitelist is not None) and whitelist != "None" + C_len = sum([item[1] - item[0] for item in pattern_dict['C']]) + + if bool_whitelist: + seq_list, _ = utils.read_one_col(whitelist) + barcode_correct_set, barcode_mismatch_dict = get_all_mismatch(seq_list, n_mismatch=1) + barcode_correct_set_list = [barcode_correct_set] * 3 + barcode_mismatch_dict_list = [barcode_mismatch_dict] * 3 + if bool_L: + seq_list, _ = utils.read_one_col(linker) + check_seq(linker, pattern_dict, "L") + linker_correct_set_list = [] + linker_mismatch_dict_list = [] + start = 0 + for item in pattern_dict['L']: + end = start + item[1] - item[0] + linker_seq_list = [seq[start:end] for seq in seq_list] + linker_correct_set, linker_mismatch_dict = get_all_mismatch(linker_seq_list, n_mismatch=2) + linker_correct_set_list.append(linker_correct_set) + linker_mismatch_dict_list.append(linker_mismatch_dict) + start = end + + fq1 = pysam.FastxFile(self.fq1_list[i], persist=False) + fq2 = pysam.FastxFile(self.fq2_list[i], persist=False) + + for entry1 in fq1: + entry2 = next(fq2) + header1, seq1, qual1 = entry1.name, entry1.sequence, entry1.quality + header2, seq2, qual2 = entry2.name, entry2.sequence, entry2.quality + self.total_num += 1 + + # polyT filter + if bool_T and (not self.allowNoPolyT): + polyT = seq_ranges(seq1, pattern_dict['T']) + if polyT.count('T') < MIN_T: + self.no_polyT_num += 1 + if self.nopolyT: + fh1_without_polyT.write( + '@%s\n%s\n+\n%s\n' % (header1, seq1, qual1)) + fh2_without_polyT.write( + '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2)) + continue + + # lowQual filter + C_U_quals_ascii = seq_ranges( + qual1, pattern_dict['C'] + pattern_dict['U']) + # C_U_quals_ord = [ord(q) - 33 for q in C_U_quals_ascii] + if lowQual > 0 and low_qual(C_U_quals_ascii, lowQual, lowNum): + self.lowQual_num += 1 + continue + + # linker filter + if bool_L and (not self.allowNoLinker): + seq_list = get_seq_list(seq1, pattern_dict, 'L') + bool_valid, bool_corrected, _ = check_seq_mismatch( + 
seq_list, linker_correct_set_list, linker_mismatch_dict_list) + if not bool_valid: + self.no_linker_num += 1 + if self.noLinker: + fh1_without_linker.write( + '@%s\n%s\n+\n%s\n' % (header1, seq1, qual1)) + fh2_without_linker.write( + '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2)) + continue + elif bool_corrected: + self.linker_corrected_num += 1 + + # barcode filter + seq_list = get_seq_list(seq1, pattern_dict, 'C') + if bool_whitelist: + bool_valid, bool_corrected, corrected_seq = check_seq_mismatch( + seq_list, barcode_correct_set_list, barcode_mismatch_dict_list) + + if not bool_valid: + self.no_barcode_num += 1 + continue + elif bool_corrected: + self.barcode_corrected_num += 1 + cb = corrected_seq + else: + cb = "".join(seq_list) + + umi = seq_ranges(seq1, pattern_dict['U']) + + self.clean_num += 1 + + if self.bool_probe: + # valid count + read_name_probe = 'None' + self.valid_count_dic[cb][umi] += 1 + + # output probe UMi and read count + find_probe = False + for probe_name in self.probe_dic: + probe_seq = self.probe_dic[probe_name] + probe_seq = probe_seq.upper() + if seq1.find(probe_seq) != -1: + self.probe_count_dic[probe_name][cb][umi] += 1 + read_name_probe = probe_name + find_probe = True + break + + if not find_probe: + self.reads_without_probe += 1 + + self.barcode_qual_Counter.update(C_U_quals_ascii[:C_len]) + self.umi_qual_Counter.update(C_U_quals_ascii[C_len:]) + + outfq1.write(f'@{header1}\n{cb}{umi}\n+\n{C_U_quals_ascii}\n') + + outfq2.write(f'@{header2}\n{seq2}\n+\n{qual2}\n') + + Convert.run.logger.info(self.fq1_list[i] + ' finished.') + outfq1.close() + outfq2.close() + + # logging + Convert.run.logger.info( + f'processed reads: {utils.format_number(self.total_num)}. ' + f'valid reads: {utils.format_number(self.clean_num)}. ' + ) + + Convert.run.logger.info(f'no polyT reads number : {self.no_polyT_num}') + Convert.run.logger.info(f'low qual reads number: {self.lowQual_num}') + Convert.run.logger.info(f'no_linker: {self.no_linker_num}') + Convert.run.logger.info(f'no_barcode: {self.no_barcode_num}') + Convert.run.logger.info(f'corrected linker: {self.linker_corrected_num}') + Convert.run.logger.info(f'corrected barcode: {self.barcode_corrected_num}') + + if self.clean_num == 0: + raise Exception( + 'no valid reads found! 
please check the --chemistry parameter.') + + if self.bool_probe: + # total probe summary + total_umi = 0 + total_valid_read = 0 + for cb in self.valid_count_dic: + total_umi += len(self.valid_count_dic[cb]) + total_valid_read += sum(self.valid_count_dic[cb].values()) + + # probe summary + count_list = [] + for probe_name in self.probe_dic: + UMI_count = 0 + read_count = 0 + if probe_name in self.probe_count_dic: + for cb in self.probe_count_dic[probe_name]: + UMI_count += len(self.probe_count_dic[probe_name][cb]) + read_count += sum(self.probe_count_dic[probe_name][cb].values()) + count_list.append( + {"probe_name": probe_name, "UMI_count": UMI_count, "read_count": read_count}) + + df_count = pd.DataFrame(count_list, columns=[ + "probe_name", "read_count", "UMI_count"]) + + def format_percent(x): + x = str(round(x*100, 2))+"%" + return x + df_count["read_fraction"] = ( + df_count["read_count"]/total_valid_read).apply(format_percent) + df_count["UMI_fraction"] = ( + df_count["UMI_count"]/total_umi).apply(format_percent) + df_count.sort_values(by="UMI_count", inplace=True, ascending=False) + df_count_file = self.outdir + '/' + self.sample + '_probe_count.tsv' + df_count.to_csv(df_count_file, sep="\t", index=False) + + # stat + BarcodesQ30 = sum([self.barcode_qual_Counter[k] for k in self.barcode_qual_Counter if k >= ord2chr( + 30)]) / float(sum(self.barcode_qual_Counter.values())) * 100 + UMIsQ30 = sum([self.umi_qual_Counter[k] for k in self.umi_qual_Counter if k >= ord2chr( + 30)]) / float(sum(self.umi_qual_Counter.values())) * 100 + + def cal_percent(x): return "{:.2%}".format((x + 0.0) / self.total_num) + stat_info = ''' + Raw Reads: %s + Valid Reads: %s(%s) + Q30 of Barcodes: %.2f%% + Q30 of UMIs: %.2f%% + ''' + with open(self.outdir + '/stat.txt', 'w') as fh: + stat_info = stat_info % (utils.format_number(self.total_num), utils.format_number(self.clean_num), + cal_percent(self.clean_num), BarcodesQ30, + UMIsQ30) + stat_info = re.sub(r'^\s+', r'', stat_info, flags=re.M) + fh.write(stat_info) + + # self.fastqc() + self.clean_up() + + +@utils.add_log +def convert(args): + step_name = "convert" + convert_obj = Convert(args, step_name) + convert_obj.run() + + +def get_opts_convert(parser, sub_program=True): + parser.add_argument('--pattern', help='') + parser.add_argument('--whitelist', help='') + parser.add_argument('--linker', help='') + parser.add_argument('--lowQual', type=int, + help='max phred of base as lowQual, default=0', default=0) + parser.add_argument( + '--lowNum', type=int, help='max number with lowQual allowed, default=2', default=2) + parser.add_argument('--nopolyT', action='store_true', + help='output nopolyT fq') + parser.add_argument('--noLinker', action='store_true', + help='output noLinker fq') + parser.add_argument('--probe_file', help="probe fasta file") + parser.add_argument('--allowNoPolyT', help="allow reads without polyT", action='store_true') + parser.add_argument('--allowNoLinker', help="allow reads without correct linker", action='store_true') + parser.add_argument('--gzip', help="output gzipped fastq", action='store_true') + parser.add_argument( + '--chemistry', choices=__PATTERN_DICT__.keys(), help='chemistry version', default='auto') + if sub_program: + parser.add_argument('--fq1', help='read1 fq file', required=True) + parser.add_argument('--fq2', help='read2 fq file', required=True) + parser = s_common(parser) + + return parser + diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py index 93ecdedb..8c89ad45 100644 --- 
a/celescope/trust_vdj/multi_trust_vdj.py +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -4,11 +4,22 @@ from celescope.tools.Multi import Multi class Multi_trust_vdj(Multi): + def convert(self, sample): + step = 'convert' + arr = self.fq_dict[sample] + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line} ' + f'--fq1 {arr[0]} --fq2 {arr[1]} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + def trust_assemble(self, sample): step = 'trust_assemble' cmd_line = self.get_cmd_line(step, sample) - fq1 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R1.fq{self.fq_suffix}' - fq2 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R2.fq{self.fq_suffix}' + fq1 = f'{self.outdir_dic[sample]["convert"]}/{sample}_1.fq{self.fq_suffix}' + fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' cmd = ( f'{cmd_line} ' f'--fq1 {fq1} ' -- Gitee From 5bc30611654a630c56c183f59006fe5880af037e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 16 Jun 2021 14:44:45 +0800 Subject: [PATCH 44/96] refactor variant calling --- celescope/__init__.py | 6 + celescope/snp/__init__.py | 2 +- celescope/snp/multi_snp.py | 10 +- celescope/snp/snpCalling.py | 43 -- celescope/snp/variant_calling.py | 430 ++++++++++++++++++ .../html/snp/snpCalling_summary.html | 37 -- .../html/snp/variant_calling_summary.html | 36 ++ .../tests/{func_tests.py => test_function.py} | 3 + celescope/tools/step.py | 9 +- celescope/tools/target_metrics.py | 5 +- celescope/tools/utils.py | 41 +- 11 files changed, 509 insertions(+), 113 deletions(-) delete mode 100755 celescope/snp/snpCalling.py create mode 100755 celescope/snp/variant_calling.py delete mode 100755 celescope/templates/html/snp/snpCalling_summary.html create mode 100644 celescope/templates/html/snp/variant_calling_summary.html rename celescope/tests/{func_tests.py => test_function.py} (90%) diff --git a/celescope/__init__.py b/celescope/__init__.py index 28a2454c..bea81b71 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -21,3 +21,9 @@ ASSAY_DICT = { ROOT_PATH = os.path.dirname(__file__) RELEASED_ASSAYS = ['rna', 'vdj', 'tag', ] + +HELP_DICT = { + 'match_dir': 'Match celescope scRNA-Seq directory.', + 'gene_list': 'Gene list file, one gene symbol per line. 
Only results of these genes are reported.', + +} diff --git a/celescope/snp/__init__.py b/celescope/snp/__init__.py index ef961a09..c9f17e51 100755 --- a/celescope/snp/__init__.py +++ b/celescope/snp/__init__.py @@ -1,7 +1,7 @@ __STEPS__ = [ 'mkref', 'sample', 'barcode', 'cutadapt', 'consensus', 'star', 'featureCounts', - 'target_metrics', 'snpCalling', 'analysis_snp' + 'target_metrics', 'variant_calling', 'analysis_snp' ] __ASSAY__ = 'snp' IMPORT_DICT = { diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index 8c692773..89781522 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -31,8 +31,8 @@ class Multi_snp(Multi): self.process_cmd(cmd, step, sample, m=2, x=1) - def snpCalling(self, sample): - step = 'snpCalling' + def variant_calling(self, sample): + step = 'variant_calling' cmd_line = self.get_cmd_line(step, sample) bam = f'{self.outdir_dic[sample]["target_metrics"]}/{sample}_filtered.bam' cmd = ( @@ -44,9 +44,9 @@ class Multi_snp(Multi): def analysis_snp(self, sample): step = 'analysis_snp' - vcf = f'{self.outdir_dic[sample]["snpCalling"]}/{sample}_merged.vcf' - CID_file = f'{self.outdir_dic[sample]["snpCalling"]}/{sample}_CID.tsv' - variant_count_file = f'{self.outdir_dic[sample]["snpCalling"]}/{sample}_variant_count.tsv' + vcf = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_merged.vcf' + CID_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_CID.tsv' + variant_count_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_variant_count.tsv' cmd_line = self.get_cmd_line(step, sample) cmd = ( f'{cmd_line} ' diff --git a/celescope/snp/snpCalling.py b/celescope/snp/snpCalling.py deleted file mode 100755 index 35802ecc..00000000 --- a/celescope/snp/snpCalling.py +++ /dev/null @@ -1,43 +0,0 @@ -import os - -from mutract.utils import Mutract - -import celescope.tools.utils as utils -from celescope.tools.step import s_common - - -@utils.add_log -def snpCalling(args): - - sample = args.sample - outdir = args.outdir - thread = int(args.thread) - match_dir = args.match_dir - bam = args.bam - genomeDir = args.genomeDir - gene_list_file = args.gene_list - - # check dir - if not os.path.exists(outdir): - os.system('mkdir -p %s' % (outdir)) - - # get args - _refFlat, _gtf, fasta = utils.glob_genomeDir(genomeDir, fa=True) - _match_barcode, (_cell_total, match_barcode_file) = utils.read_barcode_file(match_dir, return_file=True) - - # mutract - obj = Mutract( - outdir, sample, bam, fasta, - match_barcode_file, thread=thread, gene_file=gene_list_file - ) - obj.run() - - -def get_opts_snpCalling(parser, sub_program): - if sub_program: - s_common(parser) - parser.add_argument("--bam", help='featureCounts bam', required=True) - parser.add_argument( - "--match_dir", help="match scRNA-Seq dir", required=True) - parser.add_argument("--genomeDir", help='genomeDir', required=True) - parser.add_argument("--gene_list", help='gene_list', required=True) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py new file mode 100755 index 00000000..ec5e7c02 --- /dev/null +++ b/celescope/snp/variant_calling.py @@ -0,0 +1,430 @@ +import logging +import os +import subprocess +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor + +import pandas as pd +import pysam +from scipy.io import mmwrite +from scipy.sparse import coo_matrix + +import celescope.tools.utils as utils +from celescope.__init__ import HELP_DICT +from celescope.tools.step import Step, s_common +from celescope.rna.mkref 
import parse_genomeDir_rna + + +class Variant_calling(Step): + """ + Features + - Perform variant calling + + Output + + `{sample}_VID.tsv` A unique numeric ID is assigned for each variant. + + `{sample}_CID.tsv` A unique numeric ID is assigned for each cell. + + `{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count. + + `{sample}_support.mtx` Support matrix, only high quality bases are considered. + 0 : no reads/UMIs cover the position. + 1 : all reads/UMIs at the position support the ref allele. + 2 : all reads/UMIs at the position support the alt allele. + 3 : one or more reads/UMIs support both the alt and the ref allele. + """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + # set + self.barcodes, _num = utils.read_barcode_file(args.match_dir) + self.fasta = parse_genomeDir_rna(args.genomeDir)['fasta'] + if args.vcf: + self.vcf_bool = True + self.vcf = args.vcf + else: + self.vcf_bool = False + self.vcf = None + self.df_vcf = None + + # out + self.splitN_bam = f'{self.out_prefix}_splitN.bam' + self.CID_file = f'{self.out_prefix}_CID.tsv' + self.VID_file = f'{self.out_prefix}_VID.tsv' + self.VID_vcf_file = f'{self.out_prefix}_VID.vcf' + self.variant_count_file = f'{self.out_prefix}_variant_count.tsv' + self.ref_mtx_file = f'{self.out_prefix}_ref.mtx' + self.alt_mtx_file = f'{self.out_prefix}_alt.mtx' + self.support_matrix_file = f'{self.out_prefix}_support.mtx' + + + @utils.add_log + def SplitNCigarReads(self): + cmd = ( + f'gatk ' + f'SplitNCigarReads ' + f'-R {self.fasta} ' + f'-I {self.args.bam} ' + f'-O {self.splitN_bam} ' + ) + Variant_calling.SplitNCigarReads.logger.info(cmd) + subprocess.check_call(cmd, shell=True) + + + @utils.add_log + def split_bam(self): + ''' + input: + bam: bam from splitN + barcodes: cell barcodes, list + ouput: + bam_dict: assign reads to cell barcodes and UMI + count_dict: UMI counts per cell + CID: assign ID(1-based) to cells + ''' + + # init + bam_dict = defaultdict(list) + CID_dict = defaultdict(dict) + cells_dir = f'{self.outdir}/cells/' + + # read bam and split + samfile = pysam.AlignmentFile(self.args.bam, "rb") + header = samfile.header + for read in samfile: + attr = read.query_name.split('_') + barcode = attr[0] + if barcode in self.barcodes: + CID = self.barcodes.index(barcode) + 1 + read.set_tag(tag='CL', value=f'CELL{CID}', value_type='Z') + + # assign read to barcode + bam_dict[barcode].append(read) + + + self.split_bam.logger.info('writing cell bam...') + # write new bam + CID = 0 + for barcode in self.barcodes: + # init + CID += 1 + CID_dict[CID]['barcode'] = barcode + CID_dict[CID]['valid'] = False + + # out bam + if barcode in bam_dict: + cell_dir = f'{cells_dir}/cell{CID}' + cell_bam_file = f'{cell_dir}/cell{CID}.bam' + if not os.path.exists(cell_dir): + os.makedirs(cell_dir) + CID_dict[CID]['valid'] = True + cell_bam = pysam.AlignmentFile( + f'{cell_bam_file}', "wb", header=header) + for read in bam_dict[barcode]: + cell_bam.write(read) + cell_bam.close() + + # out CID + df_CID = pd.DataFrame(CID_dict).T + df_CID.index.name = 'CID' + df_CID.to_csv(self.CID_file, sep='\t') + + @utils.add_log + def call_snp(self, CID): + + self.call_snp.logger.info('Processing Cell {}' % CID) + bam = f'{self.outdir}/cells/cell{CID}/cell{CID}.bam' + # sort + sorted_bam = f'{self.outdir}/cells/cell{CID}/cell{CID}_sorted.bam' + cmd_sort = ( + f'samtools sort {bam} -o {sorted_bam}' + ) + subprocess.check_call(cmd_sort, shell=True) + + # mpileup + bcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.bcf' 
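+        # Per-cell pipeline sketch: mpileup writes genotype likelihoods to
+        # an uncompressed BCF; `call -mv` then keeps variant sites only, and
+        # `norm -d none -f <fasta>` left-aligns indels and drops duplicate
+        # records. A second `call -m` pass (without -v) below emits every
+        # covered position, so reference-supporting UMIs can later be
+        # counted from the DP4 field of cell{CID}_all_norm.vcf.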
+ cmd_mpileup = ( + f'bcftools mpileup -Ou ' + f'-f {self.fasta} ' + f'{sorted_bam} -o {bcf} ' + ) + subprocess.check_call(cmd_mpileup, shell=True) + + # call + out_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.vcf' + cmd_call = ( + f'bcftools call -mv -Ov ' + f'-o {out_vcf} ' + f'{bcf}' + f'>/dev/null 2>&1 ' + ) + subprocess.check_call(cmd_call, shell=True) + + # norm + norm_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + cmd_norm = ( + f'bcftools norm -d none ' + f'-f {self.fasta} ' + f'{out_vcf} ' + f'-o {norm_vcf} ' + ) + subprocess.check_call(cmd_norm, shell=True) + + # call all position + out_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all.vcf' + cmd_all_call = ( + f'bcftools call -m -Ov ' + f'-o {out_all_vcf} ' + f'{bcf}' + f'>/dev/null 2>&1 ' + ) + subprocess.check_call(cmd_all_call, shell=True) + + # norm all + norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + cmd_all_norm = ( + f'bcftools norm -d none ' + f'-f {self.fasta} ' + f'{out_all_vcf} ' + f'-o {norm_all_vcf} ' + ) + subprocess.check_call(cmd_all_norm, shell=True) + + def call_all_snp(self): + all_res = [] + _df_index, df_valid = self.read_CID() + CID_arg = df_valid.index + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(self.call_snp, CID_arg): + all_res.append(res) + + def read_CID(self): + df_index = pd.read_csv(self.CID_file, sep='\t', index_col=0, dtype=object) + df_valid = df_index[df_index['valid'] == 'True'] + return df_index, df_valid + + @staticmethod + def _parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): + ''' + parse vcf into df + ''' + vcf = pysam.VariantFile(vcf_file) + df = pd.DataFrame(columns=[col for col in cols] + infos) + rec_dict = {} + for rec in vcf.fetch(): + + for col in cols: + rec_dict[col] = getattr(rec, col) + # if ref == alt: alleles=(ref,) + # else alleles=(ref, alt) + if col == 'alleles': + rec_dict['ref'] = rec_dict['alleles'][0] + rec_dict['alt'] = '.' 
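+                    # '.' mirrors the VCF convention for a missing ALT;
+                    # overwritten below when the record is biallelic
+                    # (alleles == (ref, alt))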
+ if len(rec_dict['alleles']) == 2: + rec_dict['alt'] = rec_dict['alleles'][1] + + for info in infos: + rec_dict[info] = rec.info[info] + + df = df.append(pd.Series(rec_dict),ignore_index=True) + return df + + def parse_vcf(self): + self.df_vcf = self._parse_vcf(self.vcf_file) + + def merge_vcf(self): + ''' + merge cell vcf into one non-duplicated vcf + add VID(variant ID) and CID(cell ID) + ''' + _df_index, df_valid = self.read_CID() + CIDs = df_valid.index + + # variant dict + v_cols = ['chrom', 'pos', 'alleles'] + v_dict = {} + + for CID in CIDs: + CID = str(CID) + vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + vcf = pysam.VariantFile(vcf_file,'r') + for rec in vcf.fetch(): + v = ','.join([str(getattr(rec, col)) for col in v_cols]) + if not v in v_dict: + v_dict[v] = dict() + v_dict[v]['CID'] = [CID] + v_dict[v]['record'] = rec + else: + v_dict[v]['CID'].append(CID) + + # output + def get_vcf_header(CIDs): + CID = CIDs[0] + vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + vcf = pysam.VariantFile(vcf_file,'r') + return vcf.header + vcf_header = get_vcf_header(CIDs) + merged_vcf_file = f'{self.outdir}/{self.sample}_merged.vcf' + vcf_header.info.add('VID', number=1, type='String', description='Variant ID') + vcf_header.info.add('CID', number=1, type='String', description='Cell ID') + merged_vcf = pysam.VariantFile(merged_vcf_file,'w', header=vcf_header) + + VID = 0 + for v in sorted(v_dict.keys()): + VID += 1 + rec = v_dict[v]['record'] + CID = ','.join(v_dict[v]['CID']) + record = merged_vcf.new_record() + cols = ['chrom', 'pos', 'alleles'] + for col in cols: + setattr(record,col, getattr(rec,col)) + record.info['VID'] = str(VID) + record.info['CID'] = CID + merged_vcf.write(record) + + merged_vcf.close() + self.vcf = merged_vcf_file + + + def write_VID_file(self): + df_VID = self.df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] + df_VID.to_csv(self.VID_file, sep='\t', index=False) + + + def add_VID(self): + vcf = pysam.VariantFile(self.vcf,'r') + vcf_header = vcf.header + if 'VID' in vcf_header.info: + logging.info('VID is already in vcf file!') + return + vcf_header.info.add('VID', number=1, type='String', description='Variant ID') + VID_vcf = pysam.VariantFile(self.VID_vcf_file, 'w', header=vcf_header) + VID = 0 + for rec in vcf.fetch(): + VID += 1 + rec.info['VID'] = str(VID) + VID_vcf.write(rec) + VID_vcf.close() + self.vcf = self.VID_vcf_file + + @utils.add_log + def cell_UMI(self, CID): + df_UMI = pd.DataFrame(columns=['VID', 'CID', 'ref_count', 'alt_count']) + norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + df_cell_vcf = self._parse_vcf(norm_all_vcf, infos=['DP4']) + + def get_DP4(row, alt): + DP4 = row['DP4'].iloc[0] + if alt == 'ref': + indexs = [0,1] + elif alt == 'alt': + indexs = [2,3] + umi = sum([DP4[index] for index in indexs]) + return umi + + def map_vcf_row(row, df_cell_vcf): + pos = row['pos'] + chrom = row['chrom'] + alt = row['alt'] + df_pos = df_cell_vcf[(df_cell_vcf['pos']==pos) & (df_cell_vcf['chrom']==chrom)] + df_ref = df_pos[df_pos['alt']=='.'] + df_alt = df_pos[df_pos['alt']==alt] + ref_UMI = 0 + alt_UMI = 0 + if df_ref.shape[0] != 0: + ref_UMI = get_DP4(df_ref, 'ref') + if df_alt.shape[0] != 0: + alt_UMI = get_DP4(df_alt, 'alt') + return ref_UMI, alt_UMI, pos, chrom, alt + + for index in self.df_vcf.index: + row = self.df_vcf.loc[index,] + ref_UMI, alt_UMI, _pos, _chrom, _alt = map_vcf_row(row, df_cell_vcf) + if (ref_UMI + alt_UMI) != 0: + VID = row['VID'] + dic = { + 'VID':VID, + 'CID':CID, + 
'ref_count':ref_UMI, + 'alt_count':alt_UMI, + } + df_UMI = df_UMI.append(dic, ignore_index=True) + return df_UMI + + @utils.add_log + def get_UMI(self): + ''' + get variant and ref UMI supporting an allele + ''' + _df_index, df_valid = self.read_CID() + + df_UMI_list = [] + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(self.cell_UMI, list(df_valid.index)): + df_UMI_list.append(res) + + df_UMI = pd.concat(df_UMI_list) + df_UMI['VID'] = df_UMI['VID'].astype('int') + df_UMI.sort_values(by=['VID','CID'], inplace=True) + df_UMI.to_csv(self.variant_count_file, sep='\t', index=False) + + def write_support_matrix(self): + def set_support_bit(row): + ref_bit = 1 if row['ref_count'] > 0 else 0 + alt_bit = 2 if row['alt_count'] > 0 else 0 + support_bit = ref_bit + alt_bit + return support_bit + + df_variant_count = pd.read_csv(self.variant_count_file, sep='\t') + df_variant_count['support'] = self.df_variant_count.apply(set_support_bit, axis=1) + support_mtx = coo_matrix( + (df_variant_count.support, (df_variant_count.VID - 1, df_variant_count.CID - 1)) + ) + mmwrite(self.support_mtx_file, support_mtx) + + + def run(self): + self.SplitNCigarReads() + self.split_bam() + self.call_all_snp() + if self.vcf_bool: + self.add_VID() + else: + self.merge_vcf() + self.parse_vcf() + self.write_VID_file() + self.get_UMI() + self.write_support_matrix() + + +@utils.add_log +def variant_calling(args): + + step_name = 'variant_calling' + variant_calling_obj = Variant_calling(args, step_name) + variant_calling_obj.run() + + +def get_opts_variant_calling(parser, sub_program): + + parser.add_argument("--genomeDir", help='Genome directory', required=True) + parser.add_argument( + "--vcf", + help="""VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level +and use these variants as input vcf.""", + required=False + ) + if sub_program: + parser.add_argument( + "--bam", + help='Input BAM file from step `target_metrics`. ', + required=True + ) + parser.add_argument( + "--match_dir", + help=HELP_DICT['match_dir'], + required=True + ) + s_common(parser) diff --git a/celescope/templates/html/snp/snpCalling_summary.html b/celescope/templates/html/snp/snpCalling_summary.html deleted file mode 100755 index 9c63da39..00000000 --- a/celescope/templates/html/snp/snpCalling_summary.html +++ /dev/null @@ -1,37 +0,0 @@ -
-<div>
-    <div>
-        Variant Calling
-    </div>
-    <div>
-        <table>
-            {% for item in snpCalling_summary %}
-            {% if loop.index <= (loop.length+1)/2 %}
-            <tr>
-                {% for i in item %}
-                <td>{{ i|e }}</td>
-                {% endfor %}
-            </tr>
-            {% endif %}
-            {% endfor %}
-        </table>
-        <table>
-            {% for item in snpCalling_summary %}
-            {% if loop.index > (loop.length+1)/2 %}
-            <tr>
-                {% for i in item %}
-                <td>{{ i|e }}</td>
-                {% endfor %}
-            </tr>
-            {% endif %}
-            {% endfor %}
-        </table>
-    </div>
-</div>
-
diff --git a/celescope/templates/html/snp/variant_calling_summary.html b/celescope/templates/html/snp/variant_calling_summary.html
new file mode 100644
index 00000000..0cb55764
--- /dev/null
+++ b/celescope/templates/html/snp/variant_calling_summary.html
@@ -0,0 +1,36 @@
+<div>
+    <div>
+        Variant Calling
+    </div>
+    <div>
+        <table>
+            {% for item in variant_calling_summary %}
+            {% if loop.index <= (loop.length+1)/2 %}
+            <tr>
+                {% for i in item %}
+                <td>{{ i|e }}</td>
+                {% endfor %}
+            </tr>
+            {% endif %}
+            {% endfor %}
+        </table>
+        <table>
+            {% for item in variant_calling_summary %}
+            {% if loop.index > (loop.length+1)/2 %}
+            <tr>
+                {% for i in item %}
+                <td>{{ i|e }}</td>
+                {% endfor %}
+            </tr>
+            {% endif %}
+            {% endfor %}
+        </table>
+    </div>
\ No newline at end of file diff --git a/celescope/tests/func_tests.py b/celescope/tests/test_function.py similarity index 90% rename from celescope/tests/func_tests.py rename to celescope/tests/test_function.py index ecc0601e..8936c9aa 100755 --- a/celescope/tests/func_tests.py +++ b/celescope/tests/test_function.py @@ -10,6 +10,7 @@ class Tests(unittest.TestCase): def setUp(self): pass + @unittest.skip("tested") def test_stat_to_metric(self): os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/multi_tests/rna') args_dict = { @@ -26,3 +27,5 @@ class Tests(unittest.TestCase): obj.stat_to_metric() print(obj.content_dict['metric']) + def test_test(self): + assert 0 == 0 \ No newline at end of file diff --git a/celescope/tools/step.py b/celescope/tools/step.py index 43f981f7..0c759de3 100755 --- a/celescope/tools/step.py +++ b/celescope/tools/step.py @@ -12,6 +12,8 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape from celescope.tools.utils import add_log +Metric = namedtuple("Metric", "name value total fraction") + def s_common(parser): """subparser common arguments """ @@ -33,7 +35,7 @@ class Step: self.outdir = args.outdir self.sample = args.sample self.assay = args.assay - self.thread = args.thread + self.thread = int(args.thread) self.debug = args.debug # set self.out_prefix = f'{self.outdir}/{self.sample}' @@ -43,7 +45,6 @@ class Step: os.system('mkdir -p %s' % self.outdir) self.metric_list = [] - self.Metric = namedtuple("Metric", "name value total fraction") self.path_dict = { "metric": f'{self.outdir}/../.metrics.json', "data": f'{self.outdir}/../.data.json' @@ -68,7 +69,7 @@ class Step: def add_metric(self, name, value=None, total=None, fraction=None): '''add metric to metric_list ''' - self.metric_list.append(self.Metric( + self.metric_list.append(Metric( name=name, value=value, total=total, fraction=fraction )) @@ -83,7 +84,7 @@ class Step: fraction = metric.value / metric.total if fraction: fraction = round(fraction, 4) - metric_list.append(self.Metric( + metric_list.append(Metric( name=metric.name, value=metric.value, total=metric.total, diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index ed824222..1ac9dfed 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -4,6 +4,7 @@ import pysam import celescope.tools.utils as utils from celescope.tools.step import Step, s_common +from celescope.__init__ import HELP_DICT class Target_metrics(Step): @@ -86,9 +87,9 @@ def target_metrics(args): def get_opts_target_metrics(parser, sub_program): + parser.add_argument("--gene_list", help=HELP_DICT['gene_list'], required=True) if sub_program: - parser = s_common(parser) parser.add_argument("--bam", help='featureCounts bam', required=True) parser.add_argument('--match_dir', help='match_dir', required=True) - parser.add_argument("--gene_list", help='gene_list', required=True) + parser = s_common(parser) diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 6baa77b6..6741903a 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -97,27 +97,6 @@ def arg_str(arg, arg_name): return '' -def read_barcode_file(match_dir, return_file=False): - ''' - multi version compatible - ''' - match_barcode_file1 = glob.glob( - f"{match_dir}/*count*/*_cellbarcode.tsv") - match_barcode_file2 = glob.glob( - f"{match_dir}/*count*/*matrix_10X/*_cellbarcode.tsv") - match_barcode_file3 = glob.glob( - f"{match_dir}/*count*/*matrix_10X/*barcodes.tsv") - match_barcode_file = ( - match_barcode_file1 + - 
match_barcode_file2 + - match_barcode_file3)[0] - match_barcode, cell_total = read_one_col(match_barcode_file) - match_barcode = set(match_barcode) - if return_file: - return match_barcode, (cell_total, match_barcode_file) - return match_barcode, cell_total - - def format_stat(count, total_count): percent = round(count / total_count * 100, 2) string = f'{format_number(count)}({percent}%)' @@ -696,6 +675,26 @@ def parse_annovar(annovar_file): return df +def read_barcode_file(match_dir, return_file=False): + ''' + multi version compatible + ''' + match_barcode_file1 = glob.glob( + f"{match_dir}/*count*/*_cellbarcode.tsv") + match_barcode_file2 = glob.glob( + f"{match_dir}/*count*/*matrix_10X/*_cellbarcode.tsv") + match_barcode_file3 = glob.glob( + f"{match_dir}/*count*/*matrix_10X/*barcodes.tsv") + match_barcode_file = ( + match_barcode_file1 + + match_barcode_file2 + + match_barcode_file3)[0] + match_barcode, cell_total = read_one_col(match_barcode_file) + if return_file: + return match_barcode, (cell_total, match_barcode_file) + return match_barcode, cell_total + + def parse_match_dir(match_dir): match_dict = {} match_barcode, cell_total = read_barcode_file(match_dir) -- Gitee From 3b14e7bc6e1a3b9bb01d7d33b1664b293cb268a1 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 16 Jun 2021 18:08:35 +0800 Subject: [PATCH 45/96] tested --- .gitignore | 3 + celescope/snp/multi_snp.py | 2 +- celescope/snp/tests/test_variant_calling.py | 38 ++++++ celescope/snp/variant_calling.py | 138 ++++++++++---------- celescope/tools/multi.py | 2 +- release_local.py | 4 +- 6 files changed, 116 insertions(+), 71 deletions(-) create mode 100644 celescope/snp/tests/test_variant_calling.py diff --git a/.gitignore b/.gitignore index be308f44..30d2a341 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# test output +test_output/ + # vscode .vscode/ diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index 89781522..ea96c69c 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -44,7 +44,7 @@ class Multi_snp(Multi): def analysis_snp(self, sample): step = 'analysis_snp' - vcf = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_merged.vcf' + vcf = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}.vcf' CID_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_CID.tsv' variant_count_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_variant_count.tsv' cmd_line = self.get_cmd_line(step, sample) diff --git a/celescope/snp/tests/test_variant_calling.py b/celescope/snp/tests/test_variant_calling.py new file mode 100644 index 00000000..5d612b00 --- /dev/null +++ b/celescope/snp/tests/test_variant_calling.py @@ -0,0 +1,38 @@ +import unittest +import os +from collections import namedtuple +from celescope.snp.variant_calling import Variant_calling + +ROOT_DIR = os.path.dirname(__file__) + +class Test_variant_calling(unittest.TestCase): + def setUp(self): + os.chdir(ROOT_DIR) + Args = namedtuple("Args", "thread outdir sample assay debug " + "genomeDir vcf bam match_dir") + self.args = Args( + thread=10, + outdir="./test_output/07.variant_calling", + sample="test1", + assay="snp", + debug=False, + genomeDir="/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92", + vcf=None, + bam="./test_data/06.target_metrics/subset_filter.bam", + match_dir="./test_data/match_dir", + ) + + def test_run(self): + obj = Variant_calling(self.args, "variant_calling") + ''' + obj.SplitNCigarReads() + obj.split_bam() + obj.call_all_snp() + if obj.vcf_bool: + obj.add_VID() + 
else: + obj.merge_vcf() + ''' + obj.write_VID_file() + obj.get_UMI() + obj.write_support_matrix() + obj.clean_up() \ No newline at end of file diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index ec5e7c02..2edc3e24 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -15,6 +15,33 @@ from celescope.tools.step import Step, s_common from celescope.rna.mkref import parse_genomeDir_rna + +def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): + ''' + parse vcf into df + ''' + vcf = pysam.VariantFile(vcf_file) + df = pd.DataFrame(columns=list(cols) + list(infos)) + rec_dict = {} + for rec in vcf.fetch(): + + for col in cols: + rec_dict[col] = getattr(rec, col) + # if ref == alt: alleles=(ref,) + # else alleles=(ref, alt) + if col == 'alleles': + rec_dict['ref'] = rec_dict['alleles'][0] + rec_dict['alt'] = '.' + if len(rec_dict['alleles']) == 2: + rec_dict['alt'] = rec_dict['alleles'][1] + + for info in infos: + rec_dict[info] = rec.info[info] + + df = df.append(pd.Series(rec_dict),ignore_index=True) + return df + + class Variant_calling(Step): """ Features @@ -43,20 +70,16 @@ class Variant_calling(Step): self.fasta = parse_genomeDir_rna(args.genomeDir)['fasta'] if args.vcf: self.vcf_bool = True - self.vcf = args.vcf else: self.vcf_bool = False - self.vcf = None self.df_vcf = None # out self.splitN_bam = f'{self.out_prefix}_splitN.bam' self.CID_file = f'{self.out_prefix}_CID.tsv' self.VID_file = f'{self.out_prefix}_VID.tsv' - self.VID_vcf_file = f'{self.out_prefix}_VID.vcf' + self.final_vcf_file = f'{self.out_prefix}.vcf' self.variant_count_file = f'{self.out_prefix}_variant_count.tsv' - self.ref_mtx_file = f'{self.out_prefix}_ref.mtx' - self.alt_mtx_file = f'{self.out_prefix}_alt.mtx' self.support_matrix_file = f'{self.out_prefix}_support.mtx' @@ -91,7 +114,7 @@ class Variant_calling(Step): cells_dir = f'{self.outdir}/cells/' # read bam and split - samfile = pysam.AlignmentFile(self.args.bam, "rb") + samfile = pysam.AlignmentFile(self.splitN_bam, "rb") header = samfile.header for read in samfile: attr = read.query_name.split('_') @@ -131,29 +154,30 @@ class Variant_calling(Step): df_CID.index.name = 'CID' df_CID.to_csv(self.CID_file, sep='\t') + @staticmethod @utils.add_log - def call_snp(self, CID): + def call_snp(CID, outdir, fasta): - self.call_snp.logger.info('Processing Cell {}' % CID) - bam = f'{self.outdir}/cells/cell{CID}/cell{CID}.bam' + Variant_calling.call_snp.logger.info('Processing Cell %s' % CID) + bam = f'{outdir}/cells/cell{CID}/cell{CID}.bam' # sort - sorted_bam = f'{self.outdir}/cells/cell{CID}/cell{CID}_sorted.bam' + sorted_bam = f'{outdir}/cells/cell{CID}/cell{CID}_sorted.bam' cmd_sort = ( f'samtools sort {bam} -o {sorted_bam}' ) subprocess.check_call(cmd_sort, shell=True) # mpileup - bcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.bcf' + bcf = f'{outdir}/cells/cell{CID}/cell{CID}.bcf' cmd_mpileup = ( f'bcftools mpileup -Ou ' - f'-f {self.fasta} ' + f'-f {fasta} ' f'{sorted_bam} -o {bcf} ' ) subprocess.check_call(cmd_mpileup, shell=True) # call - out_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.vcf' + out_vcf = f'{outdir}/cells/cell{CID}/cell{CID}.vcf' cmd_call = ( f'bcftools call -mv -Ov ' f'-o {out_vcf} ' @@ -163,17 +187,17 @@ class Variant_calling(Step): subprocess.check_call(cmd_call, shell=True) # norm - norm_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + norm_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_norm.vcf' cmd_norm = ( f'bcftools norm -d none ' - f'-f 
{self.fasta} ' + f'-f {fasta} ' f'{out_vcf} ' f'-o {norm_vcf} ' ) subprocess.check_call(cmd_norm, shell=True) # call all position - out_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all.vcf' + out_all_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_all.vcf' cmd_all_call = ( f'bcftools call -m -Ov ' f'-o {out_all_vcf} ' @@ -183,21 +207,24 @@ class Variant_calling(Step): subprocess.check_call(cmd_all_call, shell=True) # norm all - norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + norm_all_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' cmd_all_norm = ( f'bcftools norm -d none ' - f'-f {self.fasta} ' + f'-f {fasta} ' f'{out_all_vcf} ' f'-o {norm_all_vcf} ' ) subprocess.check_call(cmd_all_norm, shell=True) + @utils.add_log def call_all_snp(self): all_res = [] _df_index, df_valid = self.read_CID() CID_arg = df_valid.index + outdir_arg = [self.outdir] * len(CID_arg) + fasta_arg = [self.fasta] * len(CID_arg) with ProcessPoolExecutor(self.thread) as pool: - for res in pool.map(self.call_snp, CID_arg): + for res in pool.map(self.call_snp, CID_arg, outdir_arg, fasta_arg): all_res.append(res) def read_CID(self): @@ -205,37 +232,11 @@ class Variant_calling(Step): df_valid = df_index[df_index['valid'] == 'True'] return df_index, df_valid - @staticmethod - def _parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): - ''' - parse vcf into df - ''' - vcf = pysam.VariantFile(vcf_file) - df = pd.DataFrame(columns=[col for col in cols] + infos) - rec_dict = {} - for rec in vcf.fetch(): - - for col in cols: - rec_dict[col] = getattr(rec, col) - # if ref == alt: alleles=(ref,) - # else alleles=(ref, alt) - if col == 'alleles': - rec_dict['ref'] = rec_dict['alleles'][0] - rec_dict['alt'] = '.' - if len(rec_dict['alleles']) == 2: - rec_dict['alt'] = rec_dict['alleles'][1] - - for info in infos: - rec_dict[info] = rec.info[info] - - df = df.append(pd.Series(rec_dict),ignore_index=True) - return df - - def parse_vcf(self): - self.df_vcf = self._parse_vcf(self.vcf_file) + @utils.add_log def merge_vcf(self): ''' + if vcf not provided, merge cell vcf into one non-duplicated vcf add VID(variant ID) and CID(cell ID) ''' @@ -266,10 +267,9 @@ class Variant_calling(Step): vcf = pysam.VariantFile(vcf_file,'r') return vcf.header vcf_header = get_vcf_header(CIDs) - merged_vcf_file = f'{self.outdir}/{self.sample}_merged.vcf' vcf_header.info.add('VID', number=1, type='String', description='Variant ID') vcf_header.info.add('CID', number=1, type='String', description='Cell ID') - merged_vcf = pysam.VariantFile(merged_vcf_file,'w', header=vcf_header) + merged_vcf = pysam.VariantFile(self.final_vcf_file,'w', header=vcf_header) VID = 0 for v in sorted(v_dict.keys()): @@ -283,37 +283,37 @@ class Variant_calling(Step): record.info['VID'] = str(VID) record.info['CID'] = CID merged_vcf.write(record) - merged_vcf.close() - self.vcf = merged_vcf_file - + @utils.add_log def write_VID_file(self): - df_VID = self.df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] + df_vcf = parse_vcf(self.final_vcf_file) + df_VID = df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] df_VID.to_csv(self.VID_file, sep='\t', index=False) - + @utils.add_log def add_VID(self): - vcf = pysam.VariantFile(self.vcf,'r') + vcf = pysam.VariantFile(self.args.vcf,'r') vcf_header = vcf.header if 'VID' in vcf_header.info: logging.info('VID is already in vcf file!') return vcf_header.info.add('VID', number=1, type='String', description='Variant ID') - VID_vcf = pysam.VariantFile(self.VID_vcf_file, 'w', header=vcf_header) + 
VID_vcf = pysam.VariantFile(self.final_vcf_file, 'w', header=vcf_header) VID = 0 for rec in vcf.fetch(): VID += 1 rec.info['VID'] = str(VID) VID_vcf.write(rec) VID_vcf.close() - self.vcf = self.VID_vcf_file + @staticmethod @utils.add_log - def cell_UMI(self, CID): + def cell_UMI(CID, outdir, final_vcf_file): + df_vcf = parse_vcf(final_vcf_file) df_UMI = pd.DataFrame(columns=['VID', 'CID', 'ref_count', 'alt_count']) - norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' - df_cell_vcf = self._parse_vcf(norm_all_vcf, infos=['DP4']) + norm_all_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + df_cell_vcf = parse_vcf(norm_all_vcf, infos=['DP4']) def get_DP4(row, alt): DP4 = row['DP4'].iloc[0] @@ -339,8 +339,8 @@ class Variant_calling(Step): alt_UMI = get_DP4(df_alt, 'alt') return ref_UMI, alt_UMI, pos, chrom, alt - for index in self.df_vcf.index: - row = self.df_vcf.loc[index,] + for index in df_vcf.index: + row = df_vcf.loc[index,] ref_UMI, alt_UMI, _pos, _chrom, _alt = map_vcf_row(row, df_cell_vcf) if (ref_UMI + alt_UMI) != 0: VID = row['VID'] @@ -361,15 +361,19 @@ class Variant_calling(Step): _df_index, df_valid = self.read_CID() df_UMI_list = [] + CID_arg = list(df_valid.index) + outdir_arg = [self.outdir] * len(CID_arg) + final_vcf_file_arg = [self.final_vcf_file] * len(CID_arg) with ProcessPoolExecutor(self.thread) as pool: - for res in pool.map(self.cell_UMI, list(df_valid.index)): + for res in pool.map(Variant_calling.cell_UMI, CID_arg, outdir_arg, final_vcf_file_arg): df_UMI_list.append(res) df_UMI = pd.concat(df_UMI_list) df_UMI['VID'] = df_UMI['VID'].astype('int') df_UMI.sort_values(by=['VID','CID'], inplace=True) df_UMI.to_csv(self.variant_count_file, sep='\t', index=False) - + + @utils.add_log def write_support_matrix(self): def set_support_bit(row): ref_bit = 1 if row['ref_count'] > 0 else 0 @@ -378,11 +382,11 @@ class Variant_calling(Step): return support_bit df_variant_count = pd.read_csv(self.variant_count_file, sep='\t') - df_variant_count['support'] = self.df_variant_count.apply(set_support_bit, axis=1) + df_variant_count['support'] = df_variant_count.apply(set_support_bit, axis=1) support_mtx = coo_matrix( (df_variant_count.support, (df_variant_count.VID - 1, df_variant_count.CID - 1)) ) - mmwrite(self.support_mtx_file, support_mtx) + mmwrite(self.support_matrix_file, support_mtx) def run(self): @@ -393,10 +397,10 @@ class Variant_calling(Step): self.add_VID() else: self.merge_vcf() - self.parse_vcf() self.write_VID_file() self.get_UMI() self.write_support_matrix() + self.clean_up() @utils.add_log diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index ae938ad0..89e608cc 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -327,7 +327,7 @@ job_end os.system('mkdir -p ./shell/') for sample in self.shell_dict: with open(f'./shell/{sample}.sh', 'w') as f: - f.write("set -e\n") + f.write("set -eo pipefail\n") f.write(self.shell_dict[sample]) def run(self): diff --git a/release_local.py b/release_local.py index 1cbbd0e3..31998936 100755 --- a/release_local.py +++ b/release_local.py @@ -9,7 +9,7 @@ CONDA_ROOT = '/SGRNJ/Public/Software/conda_env/' @add_log def create_conda(): cmd = f""" - set -e + set -eo pipefail conda create -n {ENV_NAME} source activate {ENV_NAME} conda install --file conda_pkgs.txt --channel conda-forge --channel bioconda --channel r --channel imperial-college-research-computing @@ -25,7 +25,7 @@ def create_conda(): @add_log def lint_code(): cmd = """ - set -e + set -eo pipefail celescope -h pip 
install -i https://pypi.mirrors.ustc.edu.cn/simple/ pylint # lint -- Gitee From ea713c9b63b95fbca326ad4124a4683b80355f84 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 17 Jun 2021 10:56:13 +0800 Subject: [PATCH 46/96] refactor target_metrics --- celescope/__init__.py | 2 +- celescope/snp/variant_calling.py | 4 +- celescope/tools/target_metrics.py | 87 +++++++++++++++++++------------ docs/snp/snpCalling.md | 21 -------- docs/snp/variant_calling.md | 38 ++++++++++++++ docs/tools/target_metrics.md | 21 +++++--- 6 files changed, 109 insertions(+), 64 deletions(-) delete mode 100644 docs/snp/snpCalling.md create mode 100644 docs/snp/variant_calling.md diff --git a/celescope/__init__.py b/celescope/__init__.py index bea81b71..9d9c8f0f 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -25,5 +25,5 @@ RELEASED_ASSAYS = ['rna', 'vdj', 'tag', ] HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', 'gene_list': 'Gene list file, one gene symbol per line. Only results of these genes are reported.', - + 'genomeDir': 'Genome directory after running `mkref`.', } diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index 2edc3e24..9b564e38 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -45,7 +45,7 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): class Variant_calling(Step): """ Features - - Perform variant calling + - Perform variant calling. Output @@ -413,7 +413,7 @@ def variant_calling(args): def get_opts_variant_calling(parser, sub_program): - parser.add_argument("--genomeDir", help='Genome directory', required=True) + parser.add_argument("--genomeDir", help=HELP_DICT['genomeDir'], required=True) parser.add_argument( "--vcf", help="""VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index 1ac9dfed..9101d812 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -8,26 +8,37 @@ from celescope.__init__ import HELP_DICT class Target_metrics(Step): + """ + Features + - Filter bam file + - Filter reads that are not cell-associated. + - Filter reads that are not mapped to target genes. + + - Collect enrichment metrics. + + Output + - `filtered.bam` BAM file after filtering. 
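+
+    Metrics
+    - `Number of Target Genes`, `Total UMIs`, `Enriched UMIs`,
+    `Enriched UMIs in Cells` and `Median Enriched UMIs per Cell`,
+    collected in `parse_count_dict_add_metrics` below.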
+ """ + def __init__(self, args, step_name): Step.__init__(self, args, step_name) - self.gene_list = args.gene_list - self.match_dir = args.match_dir - self.bam = args.bam - self.out_bam_file = f'{self.outdir}/{self.sample}_filtered.bam' + # set + self.match_barcode = set(utils.parse_match_dir(args.match_dir)["match_barcode"]) + self.gene_list, self.n_gene = utils.read_one_col(args.gene_list) + self.count_dict = utils.genDict(dim=3, valType=int) - def run(self): - gene_list, n_gene = utils.read_one_col(self.gene_list) self.add_metric( name="Number of Target Genes", - value=n_gene, + value=self.n_gene, ) - match_barcode = set(utils.parse_match_dir(self.match_dir)["match_barcode"]) - count_dict = utils.genDict(dim=3, valType=int) - + # out file + self.out_bam_file = f'{self.out_prefix}_filtered.bam' - with pysam.AlignmentFile(self.bam, "rb") as reader: + @utils.add_log + def read_bam_write_filtered(self): + with pysam.AlignmentFile(self.args.bam, "rb") as reader: with pysam.AlignmentFile(self.out_bam_file, "wb", header=reader.header) as writer: for record in reader: try: @@ -36,49 +47,57 @@ class Target_metrics(Step): continue barcode = record.get_tag('CB') UMI = record.get_tag('UB') - if barcode in match_barcode and gene_name in gene_list: + if barcode in self.match_barcode and gene_name in self.gene_list: writer.write(record) - count_dict[barcode][gene_name][UMI] += 1 + self.count_dict[barcode][gene_name][UMI] += 1 - UMIs = 0 + @utils.add_log + def parse_count_dict_add_metrics(self): + total_UMIs = 0 enriched_UMIs = 0 enriched_UMIs_in_cells = 0 - enriched_UMIs_per_cell = [] - - - for barcode in count_dict: - barcode_enriched_UMI = 0 - for gene_name in count_dict[barcode]: - gene_UMI = len(count_dict[barcode][gene_name]) - UMIs += gene_UMI - if gene_name in gene_list: + enriched_UMIs_per_cell_list = [] + + for barcode in self.count_dict: + cell_enriched_UMI = 0 + for gene_name in self.count_dict[barcode]: + gene_UMI = len(self.count_dict[barcode][gene_name]) + total_UMIs += gene_UMI + if gene_name in self.gene_list: enriched_UMIs += gene_UMI - if barcode in match_barcode: + if barcode in self.match_barcode: enriched_UMIs_in_cells += gene_UMI - barcode_enriched_UMI += gene_UMI - if barcode in match_barcode: - enriched_UMIs_per_cell.append(barcode_enriched_UMI) - target_metrics.logger.debug(enriched_UMIs_per_cell) + cell_enriched_UMI += gene_UMI + + if barcode in self.match_barcode: + enriched_UMIs_per_cell_list.append(cell_enriched_UMI) + + self.add_metric( + name="Total UMIs", + value=total_UMIs, + ) self.add_metric( name="Enriched UMIs", value=enriched_UMIs, - total=UMIs, + total=total_UMIs, ) self.add_metric( name="Enriched UMIs in Cells", value=enriched_UMIs_in_cells, - total=UMIs, + total=total_UMIs, ) self.add_metric( name="Median Enriched UMIs per Cell", - value=np.median(enriched_UMIs_per_cell), + value=np.median(enriched_UMIs_per_cell_list), ) + def run(self): + self.read_bam_write_filtered() + self.parse_count_dict_add_metrics() self.clean_up() - @utils.add_log def target_metrics(args): step_name = "target_metrics" @@ -89,7 +108,7 @@ def target_metrics(args): def get_opts_target_metrics(parser, sub_program): parser.add_argument("--gene_list", help=HELP_DICT['gene_list'], required=True) if sub_program: - parser.add_argument("--bam", help='featureCounts bam', required=True) - parser.add_argument('--match_dir', help='match_dir', required=True) + parser.add_argument("--bam", help='Input bam file', required=True) + parser.add_argument('--match_dir', help=HELP_DICT['match_dir'], 
required=True) parser = s_common(parser) diff --git a/docs/snp/snpCalling.md b/docs/snp/snpCalling.md deleted file mode 100644 index 3814465a..00000000 --- a/docs/snp/snpCalling.md +++ /dev/null @@ -1,21 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` featureCounts bam - -`--match_dir` match scRNA-Seq dir - -`--genomeDir` genomeDir - -`--gene_list` gene_list - diff --git a/docs/snp/variant_calling.md b/docs/snp/variant_calling.md new file mode 100644 index 00000000..aed2d6fa --- /dev/null +++ b/docs/snp/variant_calling.md @@ -0,0 +1,38 @@ +## Features +- Perform variant calling. + +## Output + +`{sample}_VID.tsv` A unique numeric ID is assigned for each variant. + +`{sample}_CID.tsv` A unique numeric ID is assigned for each cell. + +`{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count. + +`{sample}_support.mtx` Support matrix, only high quality bases are considered. +0 : no reads/UMIs cover the position. +1 : all reads/UMIs at the position support the ref allele. +2 : all reads/UMIs at the position support the alt allele. +3 : one or more reads/UMIs support both the alt and the ref allele. + + +## Arguments +`--genomeDir` Genome directory after running `mkref`. + +`--vcf` VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level +and use these variants as input vcf. + +`--bam` Input BAM file from step `target_metrics`. + +`--match_dir` Match celescope scRNA-Seq directory. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md index 751de192..d2fbaa04 100644 --- a/docs/tools/target_metrics.md +++ b/docs/tools/target_metrics.md @@ -1,6 +1,21 @@ +## Features +- Filter bam file + - Filter reads that are not cell-associated. + - Filter reads that are not mapped to target genes. + +- Collect enrichment metrics. + +## Output +- `filtered.bam` BAM file after filtering. ## Arguments +`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported. + +`--bam` Input bam file + +`--match_dir` Match celescope scRNA-Seq directory. + `--outdir` Output diretory. `--assay` Assay name. @@ -11,9 +26,3 @@ `--debug` If this argument is used, celescope may output addtional file for debugging. 
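+
+## Example
+A minimal invocation might look like the sketch below. It assumes the
+standard `celescope <assay> <step>` entry point; all file paths are
+placeholders.
+```
+celescope snp target_metrics \
+    --outdir ./05.target_metrics \
+    --sample test1 \
+    --assay snp \
+    --gene_list gene_list.tsv \
+    --bam featureCounts_out.bam \
+    --match_dir /path/to/match_dir
+```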
-`--bam` featureCounts bam - -`--match_dir` match_dir - -`--gene_list` gene_list - -- Gitee From 328a45c371c19ce25bceaa73b3eb39817e00abdb Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 17 Jun 2021 14:29:44 +0800 Subject: [PATCH 47/96] update --- celescope/snp/multi_snp.py | 2 +- celescope/tests/test_multi.py | 5 +---- celescope/tools/star_mixin.py | 15 +++++---------- celescope/tools/target_metrics.py | 17 ++++++++++++++--- celescope/tools/utils.py | 14 ++++++++++++++ 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index ea96c69c..bad75811 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -34,7 +34,7 @@ class Multi_snp(Multi): def variant_calling(self, sample): step = 'variant_calling' cmd_line = self.get_cmd_line(step, sample) - bam = f'{self.outdir_dic[sample]["target_metrics"]}/{sample}_filtered.bam' + bam = f'{self.outdir_dic[sample]["target_metrics"]}/{sample}_filtered_sorted.bam' cmd = ( f'{cmd_line} ' f'--bam {bam} ' diff --git a/celescope/tests/test_multi.py b/celescope/tests/test_multi.py index f1ae8032..68c9f23d 100755 --- a/celescope/tests/test_multi.py +++ b/celescope/tests/test_multi.py @@ -3,7 +3,6 @@ Integration tests """ import os -import shutil import subprocess from concurrent import futures @@ -14,7 +13,7 @@ ASSAYS = [ 'vdj', 'tag', 'capture_virus', - #'snp', + 'snp', 'rna', ] @@ -28,8 +27,6 @@ def run_single(assay, test_dir): print("*" * 20 + "running " + assay + "*" * 20) subprocess.check_call('sh run_shell.sh', shell=True) subprocess.check_call('sh sjm.sh', shell=True) - if os.path.exists("test1"): - shutil.rmtree("test1") try: subprocess.check_call('sh ./shell/test1.sh', shell=True) except subprocess.CalledProcessError: diff --git a/celescope/tools/star_mixin.py b/celescope/tools/star_mixin.py index 522eada1..3694959d 100755 --- a/celescope/tools/star_mixin.py +++ b/celescope/tools/star_mixin.py @@ -64,20 +64,15 @@ class StarMixin(): @utils.add_log def sort_bam(self): - cmd = ( - f'samtools sort {self.unsort_STAR_bam} ' - f'-o {self.STAR_bam} ' - f'--threads {self.thread} ' + utils.sort_bam( + self.unsort_STAR_bam, + self.STAR_bam, + threads=self.thread, ) - StarMixin.sort_bam.logger.info(cmd) - subprocess.check_call(cmd, shell=True) @utils.add_log def index_bam(self): - cmd = f"samtools index {self.STAR_bam}" - StarMixin.index_bam.logger.info(cmd) - subprocess.check_call(cmd, shell=True) - + utils.index_bam(self.STAR_bam) def get_star_metrics(self): """ diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index 9101d812..5fe8aa39 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -24,7 +24,7 @@ class Target_metrics(Step): Step.__init__(self, args, step_name) # set - self.match_barcode = set(utils.parse_match_dir(args.match_dir)["match_barcode"]) + self.match_barcode, _num = utils.read_barcode_file(args.match_dir) self.gene_list, self.n_gene = utils.read_one_col(args.gene_list) self.count_dict = utils.genDict(dim=3, valType=int) @@ -35,6 +35,7 @@ class Target_metrics(Step): # out file self.out_bam_file = f'{self.out_prefix}_filtered.bam' + self.out_bam_file_sorted = f'{self.out_prefix}_filtered_sorted.bam' @utils.add_log def read_bam_write_filtered(self): @@ -45,8 +46,12 @@ class Target_metrics(Step): gene_name = record.get_tag('GN') except KeyError: continue - barcode = record.get_tag('CB') - UMI = record.get_tag('UB') + # compatible with 10X bam + try: + barcode = record.get_tag('CB') + UMI = 
record.get_tag('UB') + except KeyError: + continue if barcode in self.match_barcode and gene_name in self.gene_list: writer.write(record) self.count_dict[barcode][gene_name][UMI] += 1 @@ -95,6 +100,12 @@ class Target_metrics(Step): def run(self): self.read_bam_write_filtered() self.parse_count_dict_add_metrics() + utils.sort_bam( + self.out_bam_file, + self.out_bam_file_sorted, + threads=self.thread, + ) + utils.index_bam(self.out_bam_file_sorted) self.clean_up() diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 6741903a..8715e4c5 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -809,3 +809,17 @@ def find_step_module_with_folder(assay, step): folder = module_path.split('.')[1] return step_module, folder + + +def sort_bam(input_bam, output_bam, threads=1): + cmd = ( + f'samtools sort {input_bam} ' + f'-o {output_bam} ' + f'--threads {threads} ' + ) + subprocess.check_call(cmd, shell=True) + + +def index_bam(input_bam): + cmd = f"samtools index {input_bam}" + subprocess.check_call(cmd, shell=True) \ No newline at end of file -- Gitee From 078c0058f2607b8ecac688fdf2546e49b30add64 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 17 Jun 2021 16:23:52 +0800 Subject: [PATCH 48/96] add split tag --- celescope/tag/__init__.py | 2 +- celescope/tag/multi_tag.py | 12 +++++++ celescope/tag/split_tag.py | 73 ++++++++++++++++++++++++++++++++++++++ celescope/tools/utils.py | 6 ++-- 4 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 celescope/tag/split_tag.py diff --git a/celescope/tag/__init__.py b/celescope/tag/__init__.py index 3b73e247..2a655dc2 100755 --- a/celescope/tag/__init__.py +++ b/celescope/tag/__init__.py @@ -1,2 +1,2 @@ -__STEPS__ = ['sample', 'barcode', 'cutadapt', 'mapping_tag', 'count_tag', 'analysis_tag'] +__STEPS__ = ['sample', 'barcode', 'cutadapt', 'mapping_tag', 'count_tag', 'analysis_tag', 'split_tag'] __ASSAY__ = 'tag' diff --git a/celescope/tag/multi_tag.py b/celescope/tag/multi_tag.py index bae3cc15..0686fc6b 100755 --- a/celescope/tag/multi_tag.py +++ b/celescope/tag/multi_tag.py @@ -37,6 +37,18 @@ class Multi_tag(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) + def split_tag(self, sample): + step = 'split_tag' + umi_tag_file = f'{self.outdir_dic[sample]["count_tag"]}/{sample}_umi_tag.tsv' + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line} ' + f'--match_dir {self.col4_dict[sample]} ' + f'--umi_tag_file {umi_tag_file} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + def main(): multi = Multi_tag(__ASSAY__) diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py new file mode 100644 index 00000000..aec7d977 --- /dev/null +++ b/celescope/tag/split_tag.py @@ -0,0 +1,73 @@ +""" +split scRNA-Seq fastq file(01.barcode/{sample}_2.fq) +""" +import glob +import os + +import pysam +import pandas as pd + +import celescope.tools.utils as utils +from celescope.tools.step import Step, s_common +from celescope.__init__ import HELP_DICT + +class Split_tag(Step): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + # set + + df_umi_tag = pd.read_csv(args.umi_tag_file, sep='\t', index_col=0) + df_umi_tag = df_umi_tag.rename_axis('barcode').reset_index() + self.tag_barcode_dict = {tag: set(row["barcode"].tolist()) for tag, row in df_umi_tag.groupby("tag")} + + if args.split_fastq: + self.rna_fq_file = glob.glob(f'{args.match_dir}/*barcode/*_2.fq*')[0] + + fastq_outdir = f'{args.outdir}/fastqs/' + os.system(f'mkdir -p {fastq_outdir}') + 
self.fastq_files_handle = {} + for tag in self.tag_barcode_dict: + fastq_file_name = f'{fastq_outdir}/{tag}_2.fq' + self.fastq_files_handle[tag] = open(fastq_file_name, 'w') + + @utils.add_log + def write_fastq_files(self): + read_num = 0 + with pysam.FastxFile(self.rna_fq_file, 'r') as rna_fq: + for read in rna_fq: + read_num += 1 + attr = read.name.strip("@").split("_") + barcode = attr[0] + for tag in self.tag_barcode_dict: + if barcode in self.tag_barcode_dict[tag]: + self.fastq_files_handle[tag].write(str(read)) + + if read_num % 1000000 == 0: + self.write_fastq_files.logger.info(f'{read_num} done') + + for tag in self.tag_barcode_dict: + self.fastq_files_handle[tag].close() + + + @utils.add_log + def run(self): + if self.args.split_fastq: + self.write_fastq_files() + +def split_tag(args): + step_name = "split_tag" + runner = Split_tag(args, step_name) + runner.run() + +def get_opts_split_tag(parser, sub_program): + parser.add_argument( + "--split_fastq", + help="Split scRNA-Seq fastq file(01.barcode/{sample}_2.fq).", + action='store_true', + ) + if sub_program: + parser.add_argument("--umi_tag_file", help="UMI tag file", required=True) + parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True) + s_common(parser) + diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 8715e4c5..95c01547 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -173,11 +173,11 @@ def link_data(outdir, fq_dict): fh.write('ln -sf %s %s\n' % (arr[1], s + '_2.fq.gz')) -def generic_open(file_name, mode='rt'): +def generic_open(file_name, *args, **kwargs): if file_name.endswith('.gz'): - file_obj = gzip.open(file_name, mode) + file_obj = gzip.open(file_name, *args, **kwargs) else: - file_obj = open(file_name, mode) + file_obj = open(file_name, *args, **kwargs) return file_obj @add_log -- Gitee From 40004bf55a722d6edb4756a7d6b0c7627bfcb7f2 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 18 Jun 2021 08:44:06 +0800 Subject: [PATCH 49/96] 1.3.2b1 --- celescope/__init__.py | 2 +- celescope/tag/split_tag.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index 9d9c8f0f..133775be 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -1,6 +1,6 @@ import os -__VERSION__ = "1.3.2b0" +__VERSION__ = "1.3.2b1" __version__ = __VERSION__ ASSAY_DICT = { diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index aec7d977..2510fd4d 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -41,7 +41,7 @@ class Split_tag(Step): barcode = attr[0] for tag in self.tag_barcode_dict: if barcode in self.tag_barcode_dict[tag]: - self.fastq_files_handle[tag].write(str(read)) + self.fastq_files_handle[tag].write(str(read) + '\n') if read_num % 1000000 == 0: self.write_fastq_files.logger.info(f'{read_num} done') -- Gitee From a2bdec47ac8b2c8ad683c77941b3ec40f04f198c Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 18 Jun 2021 11:34:48 +0800 Subject: [PATCH 50/96] update --- celescope/tag/split_tag.py | 41 +++++++++++++++++++++++++++++-------- release_local.py | 2 +- wdl/wdl.zip | Bin 17406 -> 17406 bytes 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index 2510fd4d..0cfb5ccb 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -3,6 +3,7 @@ split scRNA-Seq fastq file(01.barcode/{sample}_2.fq) """ import glob import os +from collections import defaultdict import pysam import 
pandas as pd @@ -26,34 +27,55 @@ class Split_tag(Step): fastq_outdir = f'{args.outdir}/fastqs/' os.system(f'mkdir -p {fastq_outdir}') - self.fastq_files_handle = {} + + self.r2_fastq_files_handle = {} + self.r1_fastq_files_handle = {} for tag in self.tag_barcode_dict: - fastq_file_name = f'{fastq_outdir}/{tag}_2.fq' - self.fastq_files_handle[tag] = open(fastq_file_name, 'w') + r2_fastq_file_name = f'{fastq_outdir}/{tag}_2.fq' + self.r2_fastq_files_handle[tag] = open(r2_fastq_file_name, 'w') + r1_fastq_file_name = f'{fastq_outdir}/{tag}_1.fq' + self.r1_fastq_files_handle[tag] = open(r1_fastq_file_name, 'w') + + self.tag_read_index_dict = defaultdict(set) + @utils.add_log - def write_fastq_files(self): + def write_r2_fastq_files(self): read_num = 0 with pysam.FastxFile(self.rna_fq_file, 'r') as rna_fq: for read in rna_fq: read_num += 1 attr = read.name.strip("@").split("_") barcode = attr[0] + read_index = int(attr[2]) for tag in self.tag_barcode_dict: if barcode in self.tag_barcode_dict[tag]: - self.fastq_files_handle[tag].write(str(read) + '\n') + self.tag_read_index_dict[tag].add(read_index) + self.r2_fastq_files_handle[tag].write(str(read) + '\n') if read_num % 1000000 == 0: - self.write_fastq_files.logger.info(f'{read_num} done') + self.write_r2_fastq_files.logger.info(f'{read_num} done') - for tag in self.tag_barcode_dict: - self.fastq_files_handle[tag].close() + for tag in self.r2_fastq_files_handle: + self.r2_fastq_files_handle[tag].close() + + @utils.add_log + def write_r1_fastq_files(self): + with pysam.FastxFile(self.args.R1_read, 'r') as r1_read: + for read_index, read in enumerate(r1_read, start=1): + for tag in self.tag_read_index_dict: + if read_index in self.tag_read_index_dict[tag]: + self.r1_fastq_files_handle[tag].write(str(read) + '\n') + + for tag in self.r1_fastq_files_handle: + self.r1_fastq_files_handle[tag].close() @utils.add_log def run(self): if self.args.split_fastq: - self.write_fastq_files() + self.write_r2_fastq_files() + self.write_r1_fastq_files() def split_tag(args): step_name = "split_tag" @@ -69,5 +91,6 @@ def get_opts_split_tag(parser, sub_program): if sub_program: parser.add_argument("--umi_tag_file", help="UMI tag file", required=True) parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True) + parser.add_argument("--R1_read", help='R1 read path') s_common(parser) diff --git a/release_local.py b/release_local.py index 31998936..9686de9b 100755 --- a/release_local.py +++ b/release_local.py @@ -14,7 +14,7 @@ def create_conda(): source activate {ENV_NAME} conda install --file conda_pkgs.txt --channel conda-forge --channel bioconda --channel r --channel imperial-college-research-computing - pip install -i https://pypi.mirrors.ustc.edu.cn/simple/ celescope + pip install --no-cache-dir -i https://pypi.mirrors.ustc.edu.cn/simple/ celescope python setup.py install ln -s /SGRNJ/Database/script/soft/gatk-4.1.8.1/gatk {CONDA_ROOT}/{ENV_NAME}/bin/gatk """ diff --git a/wdl/wdl.zip b/wdl/wdl.zip index 03a2bedfffa11a0f4ff41ab71fbda0203921f927..97cc01e19dfac575d630b594f96a988f0fb10858 100644 GIT binary patch delta 312 zcmey@&iJpLkvG7bnT3l11h#mdn#e1~?wo!wA-nbDL_H_w@Kq-#&ta3BtZv6QalQic zxzi^n9@+?^HkUD;V+1qYnHPf@li#x30SN%Ow~M7{EqoX`+X z24;{mLE0nvXEH%8lbXygqzg8|UT7sVnDJWlIK<|y;yqvi7YP?tuoJp%H9%%eHgp!- ge8$#_87#=@XaW^fn4I9a5G=;xqzzZ-;k1nr0AK2VumAu6 delta 312 zcmey@&iJpLkvG7bnT3l11iAxGOyrefSG}+=;fU6OiF!`VI@|{)&ta3BtZv6QalQg` znEZi>hc<$!&1H<|7{Ls8=EY#f z!Kucs-G3m#Bj~{9R?a67qjI>7n83;x@=jz_uk1S+^wnmKM+_4K!#Pd{26hG+hN8Sg 
zz4DZt&=5`rW>cVckV_)@XEH%8lbXygqzg8|UT7sVnDJWlIK<|y;yqvi7YP?tW}r_e zciU<(1HC%g&{=Hr8Cxr6u)8=NO_+i10tzZjPH Date: Fri, 18 Jun 2021 13:09:32 +0800 Subject: [PATCH 51/96] fix --- celescope/templates/html/tracer_vdj/base.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/celescope/templates/html/tracer_vdj/base.html b/celescope/templates/html/tracer_vdj/base.html index 5318bb34..335cd104 100755 --- a/celescope/templates/html/tracer_vdj/base.html +++ b/celescope/templates/html/tracer_vdj/base.html @@ -137,7 +137,7 @@ {% include "html/common/cutadapt_summary.html"%} {% endif %} - {% if split_fastq is defined %} + {% if split_fastq_summary is defined %} {% include "html/tracer_vdj/split_fastq_summary.html"%} {% endif %} -- Gitee From 0196a7f9e34ae8cb6c594c68453a0b386703ea3d Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 13:10:38 +0800 Subject: [PATCH 52/96] add convert_summary html --- celescope/templates/html/trust_vdj/base.html | 20 +++++------ .../html/trust_vdj/convert_summary.html | 35 +++++++++++++++++++ 2 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 celescope/templates/html/trust_vdj/convert_summary.html diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html index 5318bb34..83471801 100644 --- a/celescope/templates/html/trust_vdj/base.html +++ b/celescope/templates/html/trust_vdj/base.html @@ -129,24 +129,20 @@ {% include "html/common/sample_summary.html"%} {% endif %} - {% if barcode_summary is defined %} - {% include "html/common/barcode_summary.html"%} + {% if convert_summary is defined %} + {% include "html/trust_vdj/convert_summary.html"%} {% endif %} - {% if cutadapt_summary is defined %} - {% include "html/common/cutadapt_summary.html"%} + {% if trust_assemble_summary is defined %} + {% include "html/trust_vdj/trust_assemble_summary.html"%} {% endif %} - {% if split_fastq is defined %} - {% include "html/tracer_vdj/split_fastq_summary.html"%} + {% if map_summary is defined %} + {% include "html/trust_vdj/map_summary.html"%} {% endif %} - {% if go_assemble_summary is defined %} - {% include "html/tracer_vdj/go_assemble_summary.html"%} - {% endif %} - - {% if vdj_sum_summary is defined %} - {% include "html/tracer_vdj/vdj_sum_summary.html"%} + {% if res_filter_summary is defined %} + {% include "html/trust_vdj/res_filter_summary.html"%} {% endif %} {% if table_dict is defined %} diff --git a/celescope/templates/html/trust_vdj/convert_summary.html b/celescope/templates/html/trust_vdj/convert_summary.html new file mode 100644 index 00000000..4179741f --- /dev/null +++ b/celescope/templates/html/trust_vdj/convert_summary.html @@ -0,0 +1,35 @@ +
+<div id="convert_summary">
+    <h4>Demultiplexing</h4>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in convert_summary %}
+            {% if loop.index <= (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in convert_summary %}
+            {% if loop.index > (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+</div>
\ No newline at end of file -- Gitee From fc79f3591f27c1850c059b71f3873cc06841ce6b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 13:11:56 +0800 Subject: [PATCH 53/96] fix a bug of BCR cdr3 aa --- celescope/tracer_vdj/go_assemble.py | 1 + celescope/tracer_vdj/vdj_sum.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 3a291d39..d614c90c 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -94,6 +94,7 @@ def assemble_summary(outdir, Seqtype, sample, species): 'total_count': total_count, }) os.system(f'rm {outdir}/BR{locus}.sam') + go_assemble_summary.insert(0, { 'item': 'All reads Mapped to IGH, IGL and IGK', 'count': total_mapped, diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 01a25116..e5fd31a8 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -103,7 +103,7 @@ def filtering(Seqtype, ass_dir, outdir): tmplist = [] for nt in ntseqs: nt = Seq(nt) - nt = nt.reverse_complement() + nt = nt.translate() tmplist.append(str(nt)) tmp.insert(tmp.shape[1], f'IG{locus}_CDR3aa', tmplist) -- Gitee From 8e9c6ef95e86db034e605fe73d4924b76efa319b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 13:13:16 +0800 Subject: [PATCH 54/96] change matched fq name --- celescope/trust_vdj/res_filter.py | 11 +++++------ celescope/trust_vdj/trust_assemble.py | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index ce6b1e2d..c524d4e8 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -8,9 +8,9 @@ def beauty_res(outdir, barcode_report): res = pd.read_csv(barcode_report, sep='\t') rows = res.shape[0] loci = ['A', 'B'] - chians = ['chain2', 'chain1'] + chains = ['chain2', 'chain1'] for l in range(len(loci)): - chain = chians[l] + chain = chains[l] locus = loci[l] Vgenes, Dgenes, Jgenes, Cgenes, cdr3nts, cdr3aas, readcounts, fuls = [], [], [], [], [], [], [], [] @@ -65,11 +65,10 @@ class Res_filter(Step): def run(self): barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' res = beauty_res(self.outdir, barcode_report) - filtered = res[(res['TRB_fl']!='0')&(res['TRA_fl']!='0')] - fre = [''] * filtered.shape[0] - filtered.insert(filtered.shape[1], 'Frequent', fre) + fre = [''] * res.shape[0] + res.insert(res.shape[1], 'Frequent', fre) - clones = filtered.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) + clones = res.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) clones = clones.sort_values(by='Frequent', ascending=False) clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py index 1f09ab19..cbe9c25c 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/trust_assemble.py @@ -67,12 +67,12 @@ class Trust_assemble(Step): match_barcodes(self.outdir, self.match_dir, self.Seqtype, self.fq1) cmd1 = ( - f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R1.fq' + f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R1.fq' ) os.system(cmd1) cmd2 = ( - f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R2.fq' + f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > 
{self.outdir}/{self.sample}_matched_R2.fq'
         )
         os.system(cmd2)
 
@@ -92,14 +92,15 @@ class Trust_assemble(Step):
         ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/human_IMGT+C.fa'
         cmd = (
             f'{TRUST} -t {self.thread} '
-            f'-u {self.outdir}/{self.sample}_R2.fq '
-            f'--barcode {self.outdir}/{self.sample}_R1.fq '
+            f'-u {self.outdir}/{self.sample}_matched_R2.fq '
+            f'--barcode {self.outdir}/{self.sample}_matched_R1.fq '
             f'--barcodeRange 0 23 + '
             f'-f {index_file} '
             f'--ref {ref} '
             f'-o {self.sample} --od {self.outdir}/TRUST4'
         )
 
+        Trust_assemble.run.logger.info(cmd)
         os.system(cmd)
 
         os.remove(f'{self.outdir}/seqlist.txt')
-- 
Gitee
From 7b80c6a5f101b1ea0d64902dfa029d4672cd7a57 Mon Sep 17 00:00:00 2001
From: zhouxinseeu
Date: Fri, 18 Jun 2021 13:13:54 +0800
Subject: [PATCH 55/96] add seqtk
---
 conda_pkgs.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conda_pkgs.txt b/conda_pkgs.txt
index 3f032270..22f5fab0 100755
--- a/conda_pkgs.txt
+++ b/conda_pkgs.txt
@@ -10,4 +10,5 @@ r-argparser
 r-tidyverse
 mixcr=3.0.3
 bioconductor-dropletutils
-bcftools==1.9
\ No newline at end of file
+bcftools==1.9
+seqtk
\ No newline at end of file
-- 
Gitee
From e4a888f65c02d3a37c3d2fd4b900daff4883279c Mon Sep 17 00:00:00 2001
From: zhouyiqi
Date: Fri, 18 Jun 2021 15:51:18 +0800
Subject: [PATCH 56/96] remove glob_genomeDir
---
 celescope/capture_rna/tests.py | 35 ---------------------------------
 celescope/tools/count.py       |  8 ++++----
 celescope/tools/utils.py       | 36 ----------------------------------
 3 files changed, 4 insertions(+), 75 deletions(-)
 delete mode 100755 celescope/capture_rna/tests.py

diff --git a/celescope/capture_rna/tests.py b/celescope/capture_rna/tests.py
deleted file mode 100755
index 644953d2..00000000
--- a/celescope/capture_rna/tests.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import unittest
-
-from celescope.tools.report import reporter
-
-
-class testHLA(unittest.TestCase):
-    def setUp(self):
-        '''
-        os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/0910_panel/')
-        self.sample = 'S20071508_D_TS'
-        count_detail_file = './/S20071508_D_TS/05.count_capture_rna/S20071508_D_TS_count_detail.txt'
-        self.df = pd.read_table(count_detail_file, header=0)
-        self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200729/S20071508_D_ZL'
-        self.sc_cell_barcodes, self.sc_cell_number = read_barcode_file(self.match_dir)
-        self.outdir = f'{self.sample}/05.count_capture_rna/'
-        self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
-        self.validated_barcodes, _ = read_one_col(f'{self.sample}/05.count_capture_rna/{self.sample}_matrix_10X/barcodes.tsv')
-        _refFlat, self.gtf = glob_genomeDir(self.genomeDir)
-        self.assay = 'capture_rna'
-        '''
-
-    @unittest.skip('pass')
-    def test_report(self):
-        t = reporter(assay=self.assay,
-                     name='count_capture_rna', sample=self.sample,
-                     stat_file=self.outdir + '/stat.txt',
-                     outdir=self.outdir + '/..')
-        t.get_report()
-
-
-
-
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
diff --git a/celescope/tools/count.py b/celescope/tools/count.py
index 44997442..f3635ee5 100755
--- a/celescope/tools/count.py
+++ b/celescope/tools/count.py
@@ -21,6 +21,7 @@ from celescope.tools.__init__ import (BARCODE_FILE_NAME, FEATURE_FILE_NAME,
 from celescope.tools.cellranger3 import get_plot_elements
 from celescope.tools.cellranger3.cell_calling_3 import cell_calling_3
 from celescope.tools.step import Step, s_common
+from celescope.rna.mkref import parse_genomeDir_rna
 
 TOOLS_DIR = os.path.dirname(__file__)
 random.seed(0)
@@ -76,10 +77,9 @@ class Count(Step):
self.cell_calling_method = args.cell_calling_method self.expected_cell_num = int(args.expected_cell_num) self.bam = args.bam - if args.genomeDir and args.genomeDir != "None": - _refFlat, self.gtf_file, _ = utils.glob_genomeDir(args.genomeDir) - else: - self.gtf_file = args.gtf + + # set + self.gtf_file = parse_genomeDir_rna(args.genomeDir)['gtf'] self.id_name = utils.get_id_name_dict(self.gtf_file) # output files diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 95c01547..e613b0d8 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -500,42 +500,6 @@ def format_ratios(ratios: dict): ratios[key] = round(ratios[key] * 100, 2) -@add_log -def glob_genomeDir(genomeDir, fa=False): - refFlat = glob.glob(genomeDir + "/*.refFlat") - if (len(refFlat) > 1): - sys.exit("ERROR: Multiple refFlat file in " + genomeDir) - elif (len(refFlat) == 0): - sys.exit("ERROR: refFlat file not found in " + genomeDir) - else: - refFlat = refFlat[0] - glob_genomeDir.logger.info("refFlat file found: " + refFlat) - - gtf = glob.glob(genomeDir + "/*.gtf") - if (len(gtf) == 0): - sys.exit("ERROR: gtf file not found in " + genomeDir) - elif (len(gtf) > 1): - gtf = glob.glob(genomeDir + "/*.chr.gtf") - if (len(gtf) == 0): - sys.exit("ERROR: No chr gtf file in "+ genomeDir) - if (len(gtf) > 1): - sys.exit("ERROR: Multiple gtf file in " + genomeDir) - else: - gtf = gtf[0] - glob_genomeDir.logger.info("chr gtf file found: " + gtf) - else: - gtf = gtf[0] - glob_genomeDir.logger.info("gtf file found: " + gtf) - - if fa: - fasta = glob.glob(genomeDir + "/*.fa") + glob.glob(genomeDir + "/*.fasta") - if len(fasta) > 1: - sys.exit("ERROR: Multiple fasta file in " + genomeDir) - fasta = fasta[0] - return refFlat, gtf, fasta - return refFlat, gtf, None - - def get_slope(x, y, window=200, step=10): assert len(x) == len(y) start = 0 -- Gitee From 3ff563edd5d92bfb65f9bc31aaa3c443f50408db Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 18 Jun 2021 16:28:12 +0800 Subject: [PATCH 57/96] barcode compatible with 10X bam --- celescope/snp/variant_calling.py | 8 +++++--- docs/CHANGELOG.md | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index 9b564e38..d36c83c1 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -117,8 +117,11 @@ class Variant_calling(Step): samfile = pysam.AlignmentFile(self.splitN_bam, "rb") header = samfile.header for read in samfile: - attr = read.query_name.split('_') - barcode = attr[0] + try: + barcode = read.get_tag('CB') + UMI = read.get_tag('UB') + except KeyError: + continue if barcode in self.barcodes: CID = self.barcodes.index(barcode) + 1 read.set_tag(tag='CL', value=f'CELL{CID}', value_type='Z') @@ -308,7 +311,6 @@ class Variant_calling(Step): VID_vcf.close() @staticmethod - @utils.add_log def cell_UMI(CID, outdir, final_vcf_file): df_vcf = parse_vcf(final_vcf_file) df_UMI = pd.DataFrame(columns=['VID', 'CID', 'ref_count', 'alt_count']) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 4de53cb9..109dd6d7 100755 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -2,9 +2,14 @@ ## [unreleased] - 2021-06-09 ### Added + ### Changed + ### Fixed +- `celescope.tools.count` will report an error when there are multiple gtf or refFlat file under `genomeDir`. 
+ ### Removed +- `celescope.tools.utils.glob_genomeDir` ## [1.3.1] - 2021-06-09 ### Added -- Gitee From 6ae9e593b6892ea94ba710f943df34a3e3117726 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 18:11:55 +0800 Subject: [PATCH 58/96] use barcode_report.tsv to summary --- celescope/trust_vdj/res_filter.py | 120 ++++++++++++++++++------------ 1 file changed, 74 insertions(+), 46 deletions(-) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index c524d4e8..8e876672 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -1,57 +1,78 @@ import pandas as pd from celescope.tools.Step import Step, s_common from celescope.tools import utils +from collections import defaultdict @utils.add_log -def beauty_res(outdir, barcode_report): - res = pd.read_csv(barcode_report, sep='\t') - rows = res.shape[0] - loci = ['A', 'B'] +def beauty_report(barcode_report): + df = pd.read_csv(barcode_report, sep='\t') + rows = df.shape[0] chains = ['chain2', 'chain1'] - for l in range(len(loci)): + dic = defaultdict(list) + + for l in range(len(chains)): chain = chains[l] - locus = loci[l] - Vgenes, Dgenes, Jgenes, Cgenes, cdr3nts, cdr3aas, readcounts, fuls = [], [], [], [], [], [], [], [] + items = {'V': 0, 'D': 1, 'J': 2, 'C': 3, 'CDR3nt': 4, 'CDR3aa': 5, 'readcount': 6, 'full_length_assembly': -1} for i in range(rows): - attr = res.loc[i, chain] - attrs = attr.split(',') - if len(attrs) == 10: - V, D, J, C, cdr3nt, cdr3aa, readcount, fl = attrs[0], attrs[1], attrs[2], attrs[3], attrs[4], attrs[5], attrs[6], attrs[-1] - Vgenes.append(V) - Dgenes.append(D) - Jgenes.append(J) - Cgenes.append(C) - cdr3nts.append(cdr3nt) - cdr3aas.append(cdr3aa) - readcounts.append(readcount) - fuls.append(fl) - elif len(attrs) != 10: - Vgenes.append('NAN') - Dgenes.append('NAN') - Jgenes.append('NAN') - Cgenes.append('NAN') - cdr3nts.append('NAN') - cdr3aas.append('NAN') - readcounts.append('NAN') - fuls.append('NAN') - - res[f'TR{locus}_V'] = Vgenes - res[f'TR{locus}_D'] = Dgenes - res[f'TR{locus}_J'] = Jgenes - res[f'TR{locus}_C'] = Cgenes - res[f'TR{locus}_cdr3nt'] = cdr3nts - res[f'TR{locus}_cdr3aa'] = cdr3aas - res[f'TR{locus}_readcount'] = readcounts - res[f'TR{locus}_fl'] = fuls - - res.to_csv(f'{outdir}/new_barcode_report.tsv', sep='\t') + cb = df.loc[i, '#barcode'] + dic['barcode'].append(cb) + for item in items: + attr = df.loc[i, chain] + attrs = attr.split(',') + + if len(attrs) == 10: + dic[f'{item}'].append(attrs[items[item]]) + + elif len(attrs) != 10: + dic[f'{item}'].append('None') + + res = pd.DataFrame(dic, columns=list(dic.keys())) return res +def get_clone_table(df, Seqtype): + res = pd.DataFrame() + group_type = [] + if Seqtype == 'TCR': + chains = ['TRA', 'TRB'] + if Seqtype == 'BCR': + chains = ['IGH', 'IGL', 'IGK'] + for chain in chains: + tmp = df[df['V'].str.contains(chain, na=False)] + tmp = tmp.set_index('barcode') + tmp = tmp.rename(columns=lambda x: f'{chain}_'+x) + + res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None') + group_type.append(f'{chain}_CDR3aa') + + + Frequent = [''] * res.shape[0] + res.insert(res.shape[1], 'Frequent', Frequent) + clonetypes = res.groupby(group_type).agg({'Frequent': 'count'}) + clonetypes = clonetypes.sort_values(by='Frequent', ascending=False) + + sum_c = clonetypes['Frequent'].sum() + proportions = [] + for f in list(clonetypes['Frequent']): + p = f/sum_c + p = p * 100 + p = round(p, 2) + p = str(p) + '%' + proportions.append(p) + clonetypes['Proportion'] = 
proportions + clonetypes = clonetypes.sort_values(by='Frequent', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['CloneId', 'TRA_CDR3aa', 'TRB_CDR3aa', 'Frequent', 'Proportion'])) + + return clonetypes + + class Res_filter(Step): def __init__(self, args, step_name): @@ -59,20 +80,26 @@ class Res_filter(Step): self.outdir = args.outdir self.sample = args.sample + self.Seqtype = args.Seqtype @utils.add_log def run(self): barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' - res = beauty_res(self.outdir, barcode_report) - fre = [''] * res.shape[0] - res.insert(res.shape[1], 'Frequent', fre) + df = beauty_report(barcode_report) + df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t') - clones = res.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) - clones = clones.sort_values(by='Frequent', ascending=False) + clones = get_clone_table(df, self.Seqtype) clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') + title = 'Clonetypes' + table_dict = self.get_table(title, 'clonetypes_table', clones) + + self.add_data_item(table_dict=table_dict) + + self.clean_up() + @utils.add_log def res_filter(args): @@ -82,5 +109,6 @@ def res_filter(args): def get_opts_res_filter(parser, sub_program): - if sub_program: - parser = s_common(parser) \ No newline at end of file + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + if sub_program: + parser = s_common(parser) \ No newline at end of file -- Gitee From e94485ae8be761110dd919557a10d04466b91388 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Tue, 22 Jun 2021 16:10:48 +0800 Subject: [PATCH 59/96] upgrade pandas; remove mutract --- celescope/tools/count.py | 2 +- requirements.txt | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/celescope/tools/count.py b/celescope/tools/count.py index f3635ee5..26ea2be7 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -338,7 +338,7 @@ class Count(Step): os.mkdir(matrix_dir) df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'}) - mtx = coo_matrix((df_UMI.UMI, (df_UMI.index.labels[0], df_UMI.index.labels[1]))) + mtx = coo_matrix((df_UMI.UMI, (df_UMI.index.codes[0], df_UMI.index.codes[1]))) gene_id = df_UMI.index.levels[0].to_series() # add gene symbol gene_name = gene_id.apply(lambda x: self.id_name[x]) diff --git a/requirements.txt b/requirements.txt index 6aa50c92..42f0e02c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,9 @@ cutadapt==1.17 pysam==0.16.0.1 scipy==1.4.1 numpy==1.19.5 -pandas==0.23.4 jinja2>=2.10 -matplotlib==2.2.2 xopen>=0.5.0 editdistance>=0.5.3 -mutract sklearn==0.0 plotly==4.14.3 +plotnine==0.8.0 -- Gitee From b62eab8be0eb0a926ff61c0bb4ebac8fe88291b8 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 22 Jun 2021 19:11:22 +0800 Subject: [PATCH 60/96] add report --- celescope/__init__.py | 4 +- celescope/templates/html/trust_vdj/base.html | 4 - .../html/trust_vdj/res_filter_summary.html | 36 ++++++ .../trust_vdj/trust_assemble_summary.html | 40 ++++++ celescope/trust_vdj/res_filter.py | 110 ++++++++++++++-- celescope/trust_vdj/trust_assemble.py | 117 +++++++++++++++--- 6 files changed, 272 insertions(+), 39 deletions(-) create mode 100644 celescope/templates/html/trust_vdj/res_filter_summary.html create mode 100644 celescope/templates/html/trust_vdj/trust_assemble_summary.html diff --git 
a/celescope/__init__.py b/celescope/__init__.py index d1f25d8b..983fb9fc 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -14,6 +14,6 @@ ASSAY_DICT = { 'tag': 'Single Cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', - 'tracer_vdj': 'Single Cell Full Length vdj assemble', - 'trust_vdj': 'Single Cell Full Length vdj assemble' + 'tracer_vdj': 'Single Cell Full Length V(D)J Assemble', + 'trust_vdj': 'Single Cell Full Length V(D)J Assemble' } diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html index 83471801..fcd8607c 100644 --- a/celescope/templates/html/trust_vdj/base.html +++ b/celescope/templates/html/trust_vdj/base.html @@ -137,10 +137,6 @@ {% include "html/trust_vdj/trust_assemble_summary.html"%} {% endif %} - {% if map_summary is defined %} - {% include "html/trust_vdj/map_summary.html"%} - {% endif %} - {% if res_filter_summary is defined %} {% include "html/trust_vdj/res_filter_summary.html"%} {% endif %} diff --git a/celescope/templates/html/trust_vdj/res_filter_summary.html b/celescope/templates/html/trust_vdj/res_filter_summary.html new file mode 100644 index 00000000..f4a403f6 --- /dev/null +++ b/celescope/templates/html/trust_vdj/res_filter_summary.html @@ -0,0 +1,36 @@ +
+<div id="res_filter_summary">
+    <h4>Cell</h4>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in res_filter_summary %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <div class="col-md-6">
+        {{ chart|safe }}
+    </div>
+</div>
\ No newline at end of file diff --git a/celescope/templates/html/trust_vdj/trust_assemble_summary.html b/celescope/templates/html/trust_vdj/trust_assemble_summary.html new file mode 100644 index 00000000..0d18b19f --- /dev/null +++ b/celescope/templates/html/trust_vdj/trust_assemble_summary.html @@ -0,0 +1,40 @@ +
+<div id="trust_assemble_summary">
+    <h4>Mapping</h4>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in trust_assemble_summary %}
+            {% if loop.index <= (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in trust_assemble_summary %}
+            {% if loop.index > (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+</div>
\ No newline at end of file
diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py
index 8e876672..883ae849 100644
--- a/celescope/trust_vdj/res_filter.py
+++ b/celescope/trust_vdj/res_filter.py
@@ -2,6 +2,8 @@ import pandas as pd
 from celescope.tools.Step import Step, s_common
 from celescope.tools import utils
 from collections import defaultdict
+from celescope.tools.cellranger3 import get_plot_elements
+import numpy as np
 
 
 @utils.add_log
@@ -35,24 +37,27 @@ def beauty_report(barcode_report):
 
 
 def get_clone_table(df, Seqtype):
+    res_filter_summary = []
+
     res = pd.DataFrame()
     group_type = []
     if Seqtype == 'TCR':
         chains = ['TRA', 'TRB']
+        paired_groups = ['TRA_TRB']
     if Seqtype == 'BCR':
         chains = ['IGH', 'IGL', 'IGK']
-    for chain in chains:
-        tmp = df[df['V'].str.contains(chain, na=False)]
-        tmp = tmp.set_index('barcode')
-        tmp = tmp.rename(columns=lambda x: f'{chain}_'+x)
-
-        res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None')
-        group_type.append(f'{chain}_CDR3aa')
-
+        paired_groups = ['IGH_IGL', 'IGH_IGK']
+    for chain in chains:
+        tmp = df[df['V'].str.contains(chain, na=False)]
+        tmp = tmp.set_index('barcode')
+        tmp = tmp.rename(columns=lambda x: f'{chain}_'+x)
+
+        res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None')
+        group_type.append(f'{chain}_CDR3aa')
 
     Frequent = [''] * res.shape[0]
     res.insert(res.shape[1], 'Frequent', Frequent)
-    clonetypes = res.groupby(group_type).agg({'Frequent': 'count'})
+    clonetypes = res.groupby(group_type, as_index=False).agg({'Frequent': 'count'})
     clonetypes = clonetypes.sort_values(by='Frequent', ascending=False)
 
     sum_c = clonetypes['Frequent'].sum()
@@ -67,10 +72,44 @@ def get_clone_table(df, Seqtype):
     clonetypes = clonetypes.sort_values(by='Frequent', ascending=False)
     clonetypes = clonetypes.reset_index()
 
-    clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))]
-    clonetypes = clonetypes.reindex(columns=list(['CloneId', 'TRA_CDR3aa', 'TRB_CDR3aa', 'Frequent', 'Proportion']))
+    clonetype_ids = [(i+1) for i in clonetypes.index.tolist()]
+    clonetypes['index'] = clonetype_ids
+    clonetypes = clonetypes.rename(columns={'index': 'CloneId'})
 
-    return clonetypes
+    total_count = int(clonetypes['Frequent'].sum())
+
+    res_filter_summary.append({
+        'item': 'Estimated Number of Cells',
+        'count': total_count,
+        'total_count': np.nan
+    })
+
+    for group in group_type:
+        chain = group.strip('_CDR3aa')
+        tmp = clonetypes[clonetypes[group]!='None']
+        count = int(tmp['Frequent'].sum())
+        item = f'Cells with {chain}'
+        res_filter_summary.append({
+            'item': item,
+            'count': count,
+            'total_count': total_count
+        })
+
+    for pg in paired_groups:
+        attrs = pg.split('_')
+        chain1 = attrs[0]
+        chain2 = attrs[1]
+        tmp = clonetypes[(clonetypes[f'{chain1}_CDR3aa']!='None') & (clonetypes[f'{chain2}_CDR3aa']!='None')]
+        item = f'Cells with paired {chain1} and {chain2}'
+        count = int(tmp['Frequent'].sum())
+        res_filter_summary.append({
+            'item': item,
+            'count': count,
+            'total_count': total_count
+        })
+
+
+    return clonetypes, res_filter_summary
 
 
 class Res_filter(Step):
@@ -81,15 +120,52 @@ class Res_filter(Step):
 
         self.outdir = args.outdir
         self.sample = args.sample
         self.Seqtype = args.Seqtype
+        self.full_length = args.full_length
 
     @utils.add_log
     def run(self):
         barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv'
         df = beauty_report(barcode_report)
+
+        if self.full_length:
+            df = df[df['full_length_assembly']=='1']
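+        # write out the (optionally full-length-only) barcode report used by the clonotype table below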
df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t') - clones = get_clone_table(df, self.Seqtype) + clones, res_filter_summary = get_clone_table(df, self.Seqtype) + + # plot barcode umi + count_file = f'{self.outdir}/../02.trust_assemble/count.txt' + df_umi = pd.read_csv(count_file, sep='\t', index_col=False) + cells = set(df['barcode'].tolist()) + df_umi['mark'] = df_umi['barcode'].apply(lambda x: 'CB' if (x in cells) else 'UB') + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi.to_csv(count_file, sep='\t', index=False) + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_file)) + + if self.Seqtype == 'TCR': + chains = ['TRA', 'TRB'] + elif self.Seqtype == 'BCR': + chains = ['IGH', 'IGL', 'IGK'] + + for chain in chains: + tmp = df[df['V'].str.contains(chain, na=False)] + barcodes = tmp['barcode'].tolist() + if len(barcodes) != 0: + df_bc = pd.DataFrame(barcodes, columns=['barcode']) + else: + continue + + tmp_df = pd.merge(df_umi, df_bc, on='barcode', how='inner') + + mid = int(tmp_df['UMI'].median()) + item = f'Median {chain} UMIs per cell' + res_filter_summary.append({ + 'item': item, + 'count': mid, + 'total_count': np.nan + }) clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') @@ -98,6 +174,13 @@ class Res_filter(Step): self.add_data_item(table_dict=table_dict) + + stat_file = self.outdir + '/stat.txt' + + sum_df = pd.DataFrame(res_filter_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(sum_df, stat_file) + self.clean_up() @@ -110,5 +193,6 @@ def res_filter(args): def get_opts_res_filter(parser, sub_program): parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--full_length', help='only output full length assembly', action='store_true') if sub_program: parser = s_common(parser) \ No newline at end of file diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py index cbe9c25c..e052d0ae 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/trust_assemble.py @@ -5,43 +5,109 @@ from celescope.tracer_vdj.split_fastq import get_barcodes from celescope.tools.barcode import * import pysam import pandas as pd +from collections import defaultdict TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/run-trust4' def count_fq(fq1): - bcs, umis, names = [], [], [] - count_df = pd.DataFrame() + dic = defaultdict(list) with pysam.FastxFile(fq1) as fq: for entry in fq: attr = entry.sequence cb = attr[:24] umi = attr[24:] name = entry.name - bcs.append(cb) - umis.append(umi) - names.append(name) - count_df['barcode'] = bcs - count_df['UMI'] = umis - count_df['seq_name'] = names + dic['barcode'].append(cb) + dic['UMI'].append(umi) + dic['seq_name'].append(name) + + count_df = pd.DataFrame(dic, columns=list(dic.keys())) return count_df + @utils.add_log def match_barcodes(outdir, match_dir, Seqtype, fq1): annotated_bcs = get_barcodes(match_dir, Seqtype) bcs_df = pd.DataFrame(annotated_bcs, columns=['barcode']) count_df = count_fq(fq1) - df = pd.merge(bcs_df, count_df, on='barcode', how='inner') - seqnames = df['seq_name'].tolist() + + # count UMI + df_umi = count_df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) + df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi.to_csv(f'{outdir}/count.txt', sep='\t', index=False) + + df_n = pd.merge(bcs_df, count_df, on='barcode', how='inner') + seqnames = 
df_n['seq_name'].tolist() seqlist = open(f'{outdir}/seqlist.txt', 'w') for name in seqnames: seqlist.write(str(name) + '\n') - count_df.to_csv(f'{outdir}/count.txt', sep='\t') - df.to_csv(f'{outdir}/matched_count.txt', sep='\t') + +def mapping_summary(outdir, Seqtype, fq, species): + stat_file = outdir + '/stat.txt' + + trust_assemble_summary = [] + + total_mapped = 0 + + #with pysam.FastxFile(fq) as fh: + #total_count = 0 + #for entry in fh: + #total_count += 1 + + if Seqtype == 'TCR': + loci = ['TRA', 'TRB'] + stat_string = 'All reads Mapped to TRA and TRB' + + elif Seqtype == 'BCR': + loci = ['IGH', 'IGL', 'IGK'] + stat_string = 'All reads Mapped to IGH, IGL and IGK' + + for locus in loci: + cmd = ( + f'source activate bracer; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{locus} ' + f'-U {fq} ' + f'-S {outdir}/{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + item = f'Reads mapped to {locus}' + count = int(res[0]) + total_mapped += count + trust_assemble_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/{locus}.sam') + + trust_assemble_summary.insert(0, { + 'item': stat_string, + 'count': total_mapped, + 'total_count': total_count + }) + + os.system(f'rm {outdir}/log') + + df = pd.DataFrame(trust_assemble_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + class Trust_assemble(Step): """ @@ -60,6 +126,7 @@ class Trust_assemble(Step): self.fq2 = args.fq2 self.sample = args.sample self.species = args.species + self.speed_up = args.speed_up @utils.add_log @@ -84,12 +151,12 @@ class Trust_assemble(Step): species = self.species - if species =='Mmus': - index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/GRCm38_bcrtcr.fa' - ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/mouse_IMGT+C.fa' - elif species == 'Hsap': - index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/hg38_bcrtcr.fa' - ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/human_IMGT+C.fa' + index_file = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa' + ref = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_IMGT+C.fa' + + string1 = '' + if self.speed_up: + string1 = '--repseq ' cmd = ( f'{TRUST} -t {self.thread} ' f'-u {self.outdir}/{self.sample}_matched_R2.fq ' @@ -97,14 +164,23 @@ class Trust_assemble(Step): f'--barcodeRange 0 23 + ' f'-f {index_file} ' f'--ref {ref} ' + f'{string1}' f'-o {self.sample} --od {self.outdir}/TRUST4' ) Trust_assemble.run.logger.info(cmd) - os.system(cmd) + + if not os.path.exists(f'{self.outdir}/TRUST4/{self.sample}_barcode_report.tsv'): + os.system(cmd) + + #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' + + mapping_summary(self.outdir, self.Seqtype, self.fq2, species) os.remove(f'{self.outdir}/seqlist.txt') + self.clean_up() + @utils.add_log def trust_assemble(args): @@ -120,7 +196,8 @@ def get_opts_trust_assemble(parser, sub_program): parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) parser.add_argument('--match_dir', help='match_dir', required=True) parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) - parser.add_argument('--species', help='species', choices=["Mmus", 
"Hsap"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') -- Gitee From 12e33fb7125cfda9e002c36fa336641ae4f9b365 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 22 Jun 2021 19:12:24 +0800 Subject: [PATCH 61/96] rm Median UMIs per cell --- celescope/tracer_vdj/vdj_sum.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index e5fd31a8..daed728d 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -216,12 +216,7 @@ class Vdj_sum(Step): 'count': paired_cell, 'total_count': productive_cells_num, }) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': median_all, - 'total_count': np.nan - }) + for locus in loci: tmp = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_{locus}.fastq') @@ -310,11 +305,6 @@ class Vdj_sum(Step): 'total_count': productive_cells_num }) - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': median_all, - 'total_count': np.nan - }) for locus in loci: tmp = glob.glob(f'{ass_dir}/bracer/*/aligned_reads/*_BCR_{locus}.fastq') -- Gitee From 035d1c1a6c24731fc29110f89ebfe735ea1daadd Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 22 Jun 2021 19:12:57 +0800 Subject: [PATCH 62/96] rewrite map --- celescope/tracer_vdj/go_assemble.py | 101 ++++++++++------------------ 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index d614c90c..cda37f1b 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -30,77 +30,48 @@ def assemble_summary(outdir, Seqtype, sample, species): total_count = count_df['readcount'].sum() + total_mapped = 0 + if Seqtype == 'TCR': loci = ['A', 'B'] - - total_mapped = 0 - - for locus in loci: - cmd = ( - f'source activate {BRACER_CONDA}; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/tracer/resources/{species}/combinatorial_recombinomes/TCR_{locus} ' - f'-U {clean_fq} ' - f'-S {outdir}/TR{locus}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - with open(f'{outdir}/log') as fh: - for line in fh: - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - item = f'Reads mapped to TR{locus}' - count = int(res[0]) - total_mapped += count - go_assemble_summary.append({ - 'item': item, - 'count': count, - 'total_count': total_count, - }) - - os.system(f'rm {outdir}/TR{locus}.sam') - - go_assemble_summary.insert(0, { - 'item': 'All reads Mapped to TRA and TRB', - 'count': total_mapped, - 'total_count': total_count - }) - - os.system(f'rm {outdir}/log') + stat_string = 'All reads Mapped to TRA and TRB' elif Seqtype == 'BCR': loci = ['H', 'L', 'K'] + stat_string = 'All reads Mapped to IGH, IGL and IGK' + + for locus in loci: + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/tracer/resources/{species}/combinatorial_recombinomes/TCR_{locus} ' + f'-U {clean_fq} ' + f'-S {outdir}/TR{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + item = f'Reads mapped to TR{locus}' + count = int(res[0]) + total_mapped += count + go_assemble_summary.append({ + 
'item': item, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/TR{locus}.sam') + + go_assemble_summary.insert(0, { + 'item': stat_string, + 'count': total_mapped, + 'total_count': total_count + }) + + os.system(f'rm {outdir}/log') - total_mapped = 0 - - for locus in loci: - cmd = ( - f'source activate {BRACER_CONDA}; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/bracer/resources/{species}/combinatorial_recombinomes/BCR_{locus} ' - f'-U {clean_fq} ' - f'-S {outdir}/BR{locus}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - with open(f'{outdir}/log') as fh: - for line in fh: - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - item = f'Reads mapped to BR{locus}' - count = int(res[0]) - total_mapped += count - go_assemble_summary.append({ - 'item': item, - 'count': count, - 'total_count': total_count, - }) - os.system(f'rm {outdir}/BR{locus}.sam') - - go_assemble_summary.insert(0, { - 'item': 'All reads Mapped to IGH, IGL and IGK', - 'count': total_mapped, - 'total_count': total_count - }) - os.system(f'rm {outdir}/log') df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) -- Gitee From 46863cc82edcb129111d476dde3e74b9f3ac7847 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 11:02:34 +0800 Subject: [PATCH 63/96] plot_vid --- celescope/snp/utils/plot_vid.py | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 celescope/snp/utils/plot_vid.py diff --git a/celescope/snp/utils/plot_vid.py b/celescope/snp/utils/plot_vid.py new file mode 100644 index 00000000..86f489f5 --- /dev/null +++ b/celescope/snp/utils/plot_vid.py @@ -0,0 +1,88 @@ +import ast +import argparse +import glob +import os + +import pandas as pd +from plotnine import aes, geom_point, ggplot + +from celescope.celescope import ArgFormatter +import celescope.tools.utils as utils + + +SAMPLE_COL_INDEX = 2 +MATCH_DIR_COL_INDEX = 3 +VID_COL_INDEX = 4 + +@utils.add_log +def parse_mapfile(mapfile): + sample_vid_dict = {} + sample_match_dir_dict = {} + df_mapfile = pd.read_csv(mapfile, sep='\t', header=None) + def read_row(row): + sample = row[SAMPLE_COL_INDEX] + match_dir = row[MATCH_DIR_COL_INDEX] + vid_list = [int(vid) for vid in row[VID_COL_INDEX ].strip().split(',')] + sample_vid_dict[sample] = vid_list + sample_match_dir_dict[sample] = match_dir + + df_mapfile.apply(read_row, axis=1) + return sample_vid_dict, sample_match_dir_dict + +class Plot_vid(): + def __init__(self, sample, outdir, vid_list, snp_dir, match_dir): + self.sample = sample + self.vid_list = vid_list + + # set + vid_tsne_file = glob.glob(f'{snp_dir}/08.analysis_snp/*count_tsne.tsv')[0] + self.df_vid_tsne = pd.read_csv(vid_tsne_file, sep='\t', converters={"VID":ast.literal_eval}) + match_tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0] + self.df_match_tsne = pd.read_csv( match_tsne_file, sep='\t', index_col=0) + + # out + if not os.path.exists(outdir): + os.system(f'mkdir -p {outdir}') + self.out_prefix = f'{outdir}/{sample}' + self.out_plot_file = f'{self.out_prefix}_VID_tsne.png' + + @utils.add_log + def plot_vid(self): + def set_label(row): + for vid in self.vid_list: + row["VIDs"] = "wild_type" + if vid in row["VID"]: + row["VIDs"] = "mutation" + break + return row + df = self.df_vid_tsne.apply(set_label, axis=1) + barcode_list = df.loc[df["VIDs"]=="mutation",]["barcode"] + self.df_match_tsne["VIDs"] = "wild_type" + self.df_match_tsne.loc[barcode_list, "VIDs"] = "mutation" + 
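+        # plotnine scatter: match-dir t-SNE coordinates, colored by mutation status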
plot = ggplot(self.df_match_tsne, aes(x="tSNE_1",y="tSNE_2",color="VIDs")) + geom_point(size=0.2) + plot.save(self.out_plot_file) + + +def main(): + parser = argparse.ArgumentParser(description='plot snp', formatter_class=ArgFormatter) + parser.add_argument("--mapfile", help="mapfile with VIDs as 5th column", required=True) + parser.add_argument("--outdir", help="output dir", default='plot_VID') + args = parser.parse_args() + + sample_vid_dict, sample_match_dir_dict = parse_mapfile(args.mapfile) + for sample in sample_vid_dict: + vid_list = sample_vid_dict[sample] + match_dir = sample_match_dir_dict[sample] + + runner = Plot_vid( + sample=sample, + outdir=args.outdir, + vid_list=vid_list, + snp_dir=sample, + match_dir=match_dir + ) + runner.plot_vid() + + +if __name__ == '__main__': + main() -- Gitee From 13fbbbf809ad870c96cdd5b7304ebd450275b26d Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:09:06 +0800 Subject: [PATCH 64/96] scipy 1.5.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 42f0e02c..2ee11454 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ cutadapt==1.17 pysam==0.16.0.1 -scipy==1.4.1 +scipy==1.5.0 numpy==1.19.5 jinja2>=2.10 xopen>=0.5.0 -- Gitee From 058009ad7f5a6a3049ac03e25ba5c73494a04219 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:36:25 +0800 Subject: [PATCH 65/96] fix --- celescope/tools/count.py | 2 +- celescope/tools/utils.py | 2 +- requirements.txt | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/celescope/tools/count.py b/celescope/tools/count.py index 26ea2be7..b532f646 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -347,7 +347,7 @@ class Count(Step): barcodes = df_UMI.index.levels[1].to_series() genes.to_csv(f'{matrix_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) - barcodes.to_csv(f'{matrix_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') + barcodes.to_csv(f'{matrix_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t', header=False) mmwrite(f'{matrix_dir}/{MATRIX_FILE_NAME}', mtx) @utils.add_log diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index e613b0d8..72ecf120 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -427,7 +427,7 @@ def parse_map_col4(mapfile, default_val): library_id = tmp[0] library_path = tmp[1] sample_name = tmp[2] - if len(tmp) == 4: + if len(tmp) >= 4: col4 = tmp[3] else: col4 = default_val diff --git a/requirements.txt b/requirements.txt index 2ee11454..03237668 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ editdistance>=0.5.3 sklearn==0.0 plotly==4.14.3 plotnine==0.8.0 +cython -- Gitee From 3362569192d61a271d978afaf61991cd4566bd97 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:44:20 +0800 Subject: [PATCH 66/96] fix --- celescope/tools/multi.py | 11 ++++++----- celescope/tools/utils.py | 1 + requirements.txt | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 89e608cc..afdce712 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -5,7 +5,7 @@ import os from collections import defaultdict import celescope -from celescope.tools.utils import find_assay_init, find_step_module +import celescope.tools.utils as utils from celescope.celescope import ArgFormatter TOOLS_DIR = os.path.dirname(celescope.tools.__file__) @@ -15,7 +15,7 @@ class Multi(): def __init__(self, assay): 
self.__ASSAY__ = assay - init_module = find_assay_init(assay) + init_module = utils.find_assay_init(assay) self.__STEPS__ = init_module.__STEPS__ self.__CONDA__ = os.path.basename(os.environ['CONDA_DEFAULT_ENV']) self.__APP__ = 'celescope' @@ -71,11 +71,12 @@ class Multi(): def step_args(self): for step in self.__STEPS__: - step_module = find_step_module(self.__ASSAY__, step) + step_module = utils.find_step_module(self.__ASSAY__, step) func_opts = getattr(step_module, f"get_opts_{step}") func_opts(self.parser, sub_program=False) @staticmethod + @utils.add_log def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = {} @@ -86,7 +87,7 @@ class Multi(): continue line_split = line.split() library_id, library_path, sample_name = line_split[:3] - if len(line_split) == 4: + if len(line_split) >= 4: col4 = line_split[3] else: col4 = default_val @@ -165,7 +166,7 @@ job_end self.last_step = step def parse_step_args(self, step): - step_module = find_step_module(self.__ASSAY__, step) + step_module = utils.find_step_module(self.__ASSAY__, step) func_opts = getattr(step_module, f"get_opts_{step}") step_parser = argparse.ArgumentParser(step_module) func_opts(step_parser, sub_program=False) diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 72ecf120..c3085586 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -413,6 +413,7 @@ def get_fq(library_id, library_path): return fq1, fq2 +@add_log def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = defaultdict(list) diff --git a/requirements.txt b/requirements.txt index 03237668..adb9cd90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ editdistance>=0.5.3 sklearn==0.0 plotly==4.14.3 plotnine==0.8.0 +matplotlib==3.3.0 cython -- Gitee From 81a14141f28eb3119e29ecd89f169e8dfe57c163 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:52:53 +0800 Subject: [PATCH 67/96] remove pip install celescope --- .github/workflows/setup.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml index f35a9637..dedf5c52 100644 --- a/.github/workflows/setup.yml +++ b/.github/workflows/setup.yml @@ -34,7 +34,6 @@ jobs: if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Install run: | - pip install celescope python setup.py install # test -- Gitee From c10e1f8fc491e5a061f22610e4a46c9f3b5376a6 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:57:19 +0800 Subject: [PATCH 68/96] fix mutract --- celescope/snp/analysis_snp.py | 2 +- celescope/snp/variant_calling.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/celescope/snp/analysis_snp.py b/celescope/snp/analysis_snp.py index 80ecc3a0..6a109073 100755 --- a/celescope/snp/analysis_snp.py +++ b/celescope/snp/analysis_snp.py @@ -4,11 +4,11 @@ import subprocess import pandas as pd import pysam -from mutract.utils import read_CID import celescope.tools.utils as utils from celescope.tools.analysis_mixin import AnalysisMixin from celescope.tools.step import Step, s_common +from celescope.snp.variant_calling import read_CID class Analysis_variant(Step, AnalysisMixin): diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index d36c83c1..d4fe6df8 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -42,6 +42,12 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): return df +def read_CID(CID_file): + df_index = 
pd.read_csv(CID_file, sep='\t', index_col=0, dtype=object) + df_valid = df_index[df_index['valid'] == 'True'] + return df_index, df_valid + + class Variant_calling(Step): """ Features @@ -231,9 +237,7 @@ class Variant_calling(Step): all_res.append(res) def read_CID(self): - df_index = pd.read_csv(self.CID_file, sep='\t', index_col=0, dtype=object) - df_valid = df_index[df_index['valid'] == 'True'] - return df_index, df_valid + return read_CID(self.CID_file) @utils.add_log -- Gitee From 543f915daad90125a42d896417f4a579a16d043b Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 14:00:21 +0800 Subject: [PATCH 69/96] remove unused --- celescope/snp/variant_calling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index d4fe6df8..4d3e7324 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -125,7 +125,6 @@ class Variant_calling(Step): for read in samfile: try: barcode = read.get_tag('CB') - UMI = read.get_tag('UB') except KeyError: continue if barcode in self.barcodes: -- Gitee From f0aaf16971f4c9a03cbed484333e556933820c8e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 16:41:20 +0800 Subject: [PATCH 70/96] Auto chemistry detection failed info --- celescope/tools/barcode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index 1c81e3f3..c03b8874 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -238,7 +238,11 @@ class Chemistry(): Chemistry.get_chemistry.logger.info(linker_4_dict) if valid_linker_type == 0: print(linker_wrong_dict) - raise Exception('auto chemistry detection failed!') + raise Exception( + 'Auto chemistry detection failed! ' + 'If the sample is from Singleron, ask the technical staff you are connecting with for the chemistry used. ' + 'You need to use `--chemistry scopeV1` for scopeV1, and `--chemistry auto` should be fine for scopeV2.* ' + ) elif valid_linker_type == 1: chemistry = 'scopeV2.1.1' elif valid_linker_type < 4: -- Gitee From 67a38c1653b8219811d0706d789ee9404c00b55e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 16:59:38 +0800 Subject: [PATCH 71/96] add log file --- celescope/tools/multi.py | 3 ++- celescope/tools/utils.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index afdce712..9fed21b8 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -130,7 +130,8 @@ class Multi(): # mk log dir self.logdir = self.args.outdir + '/log' - os.system('mkdir -p %s' % (self.logdir)) + if self.args.mod == 'sjm': + os.system('mkdir -p %s' % (self.logdir)) # script init self.sjm_cmd = 'log_dir %s\n' % (self.logdir) diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index c3085586..a74388ea 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -30,15 +30,20 @@ def add_log(func): ''' logging start and done. 
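+    Messages go to stdout and are appended to ./celescope_log.txt;
+    each wrapped function logs under the name '<module>.<function>'.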
''' - logging.basicConfig( - level=logging.INFO, - stream=sys.stdout, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logFormatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + module = func.__module__ name = func.__name__ logger_name = f'{module}.{name}' logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + fileHandler = logging.FileHandler("./celescope_log.txt") + fileHandler.setFormatter(logFormatter) + logger.addHandler(fileHandler) + consoleHandler = logging.StreamHandler(sys.stdout) + consoleHandler.setFormatter(logFormatter) + logger.addHandler(consoleHandler) @wraps(func) def wrapper(*args, **kwargs): -- Gitee From af2c041ce802f1f3ca039315feb1200a184aea4e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 17:31:06 +0800 Subject: [PATCH 72/96] docs --- .gitignore | 1 + celescope/rna/mkref.py | 15 ++++++++++++--- celescope/tag/split_tag.py | 13 ++++++++++--- docs/manual.md | 1 + docs/rna/mkref.md | 7 ++++--- docs/tag/split_tag.md | 26 ++++++++++++++++++++++++++ 6 files changed, 54 insertions(+), 9 deletions(-) create mode 100644 docs/tag/split_tag.md diff --git a/.gitignore b/.gitignore index 30d2a341..3907b7e2 100755 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # test output +celescope_log.txt test_output/ # vscode diff --git a/celescope/rna/mkref.py b/celescope/rna/mkref.py index 26a4311c..606547b4 100755 --- a/celescope/rna/mkref.py +++ b/celescope/rna/mkref.py @@ -96,11 +96,20 @@ def mkref(args): def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: - parser.add_argument("--fasta", help="Required. Genome fasta file.", required=True) - parser.add_argument("--gtf", help="Required. Genome gtf file.", required=True) + parser.add_argument( + "--fasta", + help="Required. Genome fasta file. Must be relative file path to genomeDir.", + required=True + ) + parser.add_argument( + "--gtf", + help="Required. Genome gtf file. Must be relative file path to genomeDir.", + required=True + ) parser.add_argument( "--mt_gene_list", - help="""Mitochondria gene list file. It is a plain text file with one gene per line. + help="""Mitochondria gene list file. Must be relative file path to genomeDir. +It is a plain text file with one gene per line. If not provided, will use `MT-` and `mt-` to determine mitochondria genes.""", default="None" ) diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index 0cfb5ccb..aa3bb587 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -13,6 +13,13 @@ from celescope.tools.step import Step, s_common from celescope.__init__ import HELP_DICT class Split_tag(Step): + """ + Features + - Split scRNA-Seq fastq according to tag assignment. + + Output + - `fastq/{tag}_{1,2}.fq` Fastq files of each tag. 
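+    R2 reads are assigned to a tag by cell barcode; the matching R1 reads are then recovered by read index.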
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
@@ -85,12 +92,12 @@ def split_tag(args):
 def get_opts_split_tag(parser, sub_program):
     parser.add_argument(
         "--split_fastq",
-        help="Split scRNA-Seq fastq file(01.barcode/{sample}_2.fq).",
+        help="If used, will split scRNA-Seq fastq file according to tag assignment.",
        action='store_true',
    )
     if sub_program:
-        parser.add_argument("--umi_tag_file", help="UMI tag file", required=True)
+        parser.add_argument("--umi_tag_file", help="UMI tag file.", required=True)
         parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True)
-        parser.add_argument("--R1_read", help='R1 read path')
+        parser.add_argument("--R1_read", help='R1 read path.')
         s_common(parser)
diff --git a/docs/manual.md b/docs/manual.md
index 2412e3fe..e9caf430 100755
--- a/docs/manual.md
+++ b/docs/manual.md
@@ -35,3 +35,4 @@
 - [mapping_tag](tag/mapping_tag.md)
 - [count_tag](tag/count_tag.md)
 - [analysis_tag](tag/analysis_tag.md)
+- [split_tag](tag/split_tag.md)
diff --git a/docs/rna/mkref.md b/docs/rna/mkref.md
index 0ab50b68..c1b3d592 100644
--- a/docs/rna/mkref.md
+++ b/docs/rna/mkref.md
@@ -28,10 +28,11 @@ refflat = Homo_sapiens_ensembl_99.refFlat
 
 `--dry_run` Only write config file and exit.
 
-`--fasta` Required. Genome fasta file.
+`--fasta` Required. Genome fasta file. Must be relative file path to genomeDir.
 
-`--gtf` Required. Genome gtf file.
+`--gtf` Required. Genome gtf file. Must be relative file path to genomeDir.
 
-`--mt_gene_list` Mitochondria gene list file. It is a plain text file with one gene per line.
+`--mt_gene_list` Mitochondria gene list file. Must be relative file path to genomeDir.
+It is a plain text file with one gene per line.
 If not provided, will use `MT-` and `mt-` to determine mitochondria genes.
 
diff --git a/docs/tag/split_tag.md b/docs/tag/split_tag.md
new file mode 100644
index 00000000..5a43f7f8
--- /dev/null
+++ b/docs/tag/split_tag.md
@@ -0,0 +1,26 @@
+## Features
+- Split scRNA-Seq fastq according to tag assignment.
+
+## Output
+- `fastq/{tag}_{1,2}.fq` Fastq files of each tag.
+
+
+## Arguments
+`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment.
+
+`--umi_tag_file` UMI tag file.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--R1_read` R1 read path.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
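+
+## Example
+A minimal sketch of the splitting these arguments drive (illustrative only; the
+two-column `umi_tag_file` layout, the barcode-in-read-name convention, and the
+`split_by_tag` helper are assumptions here, not the shipped implementation):
+
+```python
+import pysam
+
+
+def split_by_tag(umi_tag_file, r2_fastq, outdir):
+    # barcode -> tag assignment table (tab-separated, header skipped)
+    barcode2tag = {}
+    with open(umi_tag_file) as f:
+        next(f)
+        for line in f:
+            barcode, tag = line.rstrip('\n').split('\t')[:2]
+            barcode2tag[barcode] = tag
+
+    # route each R2 read to the fastq of its assigned tag
+    handles = {}
+    with pysam.FastxFile(r2_fastq) as fq:
+        for read in fq:
+            # assumes the cell barcode is the first '_'-separated field of the read name
+            tag = barcode2tag.get(read.name.split('_')[0])
+            if tag is None:
+                continue
+            if tag not in handles:
+                handles[tag] = open(f'{outdir}/{tag}_2.fq', 'w')
+            handles[tag].write(str(read) + '\n')
+    for handle in handles.values():
+        handle.close()
+```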
+ -- Gitee From fd73c585029a0969e18b70e9850154e456c2fa83 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 09:14:15 +0800 Subject: [PATCH 73/96] pep8 --- celescope/capture_rna/__init__.py | 2 +- celescope/capture_rna/count_capture_rna.py | 10 +-- celescope/capture_rna/multi_capture_rna.py | 9 +-- celescope/capture_virus/__init__.py | 6 +- .../capture_virus/analysis_capture_virus.py | 7 +- .../capture_virus/count_capture_virus.py | 2 - celescope/capture_virus/mkref.py | 10 +-- .../capture_virus/multi_capture_virus.py | 5 +- celescope/capture_virus/otsu.py | 7 +- celescope/capture_virus/test.py | 6 +- celescope/celescope.py | 3 +- celescope/citeseq/Count_cite.py | 16 ++-- celescope/citeseq/__init__.py | 2 +- celescope/citeseq/analysis_cite.py | 3 +- celescope/citeseq/count_cite.py | 2 +- celescope/citeseq/multi_citeseq.py | 1 - celescope/fusion/count_fusion.py | 3 +- celescope/fusion/mkref.py | 16 ++-- celescope/fusion/multi_fusion.py | 2 - celescope/fusion/star_fusion.py | 1 - celescope/hla/mapping_hla.py | 4 +- celescope/hla/multi_hla.py | 3 +- celescope/hla/test_hla.py | 2 +- celescope/mut/count_mut.py | 3 +- celescope/mut/mapping_mut.py | 2 +- celescope/mut/multi_mut.py | 2 +- celescope/rna/__init__.py | 10 +-- celescope/rna/analysis.py | 16 ++-- celescope/rna/mkref.py | 22 +++--- celescope/rna/multi_rna.py | 2 + celescope/rna/star.py | 13 ++-- celescope/rna_virus/__init__.py | 2 +- celescope/rna_virus/analysis_rna_virus.py | 1 - celescope/rna_virus/count_virus.py | 1 - celescope/rna_virus/multi_rna_virus.py | 3 +- celescope/rna_virus/star_virus.py | 9 +-- celescope/snp/__init__.py | 2 +- celescope/snp/analysis_snp.py | 25 +++---- celescope/snp/mkref.py | 1 - celescope/snp/multi_snp.py | 5 +- celescope/snp/tests/test_variant_calling.py | 23 +++--- celescope/snp/utils/plot_vid.py | 15 ++-- celescope/snp/variant_calling.py | 74 +++++++++---------- celescope/tag/analysis_tag.py | 4 +- celescope/tag/count_tag.py | 38 +++++----- celescope/tag/mapping_tag.py | 16 ++-- celescope/tag/multi_tag.py | 3 +- celescope/tag/split_tag.py | 11 +-- celescope/tag/tests.py | 2 +- celescope/tcr_fl/__init__.py | 2 +- celescope/tcr_fl/assemble.py | 1 + celescope/tcr_fl/barcode_index.py | 3 - celescope/tcr_fl/multi_tcr_fl.py | 1 + celescope/tcr_fl/split_fq.py | 9 ++- celescope/tests/conftest.py | 2 +- celescope/tests/test_function.py | 6 +- celescope/tests/test_multi.py | 2 +- celescope/tools/analysis_mixin.py | 13 ++-- celescope/tools/barcode.py | 40 +++++----- celescope/tools/cellranger3/cell_calling_3.py | 61 +++++++-------- .../tools/cellranger3/get_plot_elements.py | 14 ++-- celescope/tools/cellranger3/sgt.py | 9 ++- celescope/tools/cellranger3/stats.py | 27 +++---- celescope/tools/consensus.py | 16 ++-- celescope/tools/count.py | 11 +-- celescope/tools/cutadapt.py | 21 +++--- celescope/tools/debug.py | 3 - celescope/tools/featureCounts.py | 6 +- celescope/tools/mkref.py | 2 +- celescope/tools/multi.py | 17 ++--- celescope/tools/report.py | 4 +- celescope/tools/sample.py | 6 +- celescope/tools/star_mixin.py | 25 ++++--- celescope/tools/step.py | 20 ++--- celescope/tools/target_metrics.py | 5 +- celescope/tools/tests.py | 9 ++- celescope/tools/utils.py | 64 ++++++++-------- celescope/vdj/__init__.py | 2 +- celescope/vdj/mapping_vdj.py | 15 ++-- celescope/vdj/multi_vdj.py | 4 +- 80 files changed, 407 insertions(+), 440 deletions(-) diff --git a/celescope/capture_rna/__init__.py b/celescope/capture_rna/__init__.py index 9197e1fc..0f9e4577 100755 --- a/celescope/capture_rna/__init__.py +++ 
b/celescope/capture_rna/__init__.py @@ -12,4 +12,4 @@ __STEPS__ = [ IMPORT_DICT = { 'star': 'celescope.rna', 'analysis': 'celescope.rna', -} \ No newline at end of file +} diff --git a/celescope/capture_rna/count_capture_rna.py b/celescope/capture_rna/count_capture_rna.py index eb54a58f..05062587 100755 --- a/celescope/capture_rna/count_capture_rna.py +++ b/celescope/capture_rna/count_capture_rna.py @@ -10,7 +10,7 @@ from celescope.tools.count import Count, get_opts_count class Count_capture_rna(Count): - + def bam2table(self): """ read probe file @@ -56,7 +56,7 @@ class Count_capture_rna(Count): read_count = 0 for barcode in probe_gene_count_dict[probe][geneName]: for umi in probe_gene_count_dict[probe][geneName][barcode]: - umi_count += len( probe_gene_count_dict[probe][geneName][barcode]) + umi_count += len(probe_gene_count_dict[probe][geneName][barcode]) read_count += probe_gene_count_dict[probe][geneName][barcode][umi] row_list.append({ 'probe': probe, @@ -67,13 +67,12 @@ class Count_capture_rna(Count): }) df_probe = pd.DataFrame(row_list, - columns=['probe', 'gene', 'barcode_count', 'read_count', 'UMI_count']) + columns=['probe', 'gene', 'barcode_count', 'read_count', 'UMI_count']) df_probe = df_probe.groupby(['probe']).apply( lambda x: x.sort_values('UMI_count', ascending=False) ) return df_probe - def run(self): df_probe = self.bam2table() df_probe.to_csv(f'{self.outdir}/{self.sample}_probe_gene_count.tsv', sep='\t', index=False) @@ -111,7 +110,6 @@ class Count_capture_rna(Count): self.clean_up() - @utils.add_log def count_capture_rna(args): # TODO! @@ -122,4 +120,4 @@ def count_capture_rna(args): def get_opts_count_capture_rna(parser, sub_program): - get_opts_count(parser, sub_program) \ No newline at end of file + get_opts_count(parser, sub_program) diff --git a/celescope/capture_rna/multi_capture_rna.py b/celescope/capture_rna/multi_capture_rna.py index 4de4f671..948e3b02 100755 --- a/celescope/capture_rna/multi_capture_rna.py +++ b/celescope/capture_rna/multi_capture_rna.py @@ -3,7 +3,7 @@ from celescope.tools.multi import Multi class Multi_capture_rna(Multi): - + def count_capture_rna(self, sample): step = 'count_capture_rna' cmd_line = self.get_cmd_line(step, sample) @@ -14,7 +14,7 @@ class Multi_capture_rna(Multi): f'--match_dir {self.col4_dict[sample]} ' ) self.process_cmd(cmd, step, sample, m=10, x=1) - + def analysis(self, sample): step = 'analysis' cmd_line = self.get_cmd_line(step, sample) @@ -30,9 +30,6 @@ def main(): multi = Multi_capture_rna(__ASSAY__) multi.run() + if __name__ == '__main__': main() - - - - diff --git a/celescope/capture_virus/__init__.py b/celescope/capture_virus/__init__.py index 8b7b9b02..8efe430c 100755 --- a/celescope/capture_virus/__init__.py +++ b/celescope/capture_virus/__init__.py @@ -1,5 +1,5 @@ -__STEPS__ = [ - 'mkref', +__STEPS__ = [ + 'mkref', 'sample', 'barcode', 'cutadapt', @@ -12,4 +12,4 @@ __ASSAY__ = 'capture_virus' IMPORT_DICT = { 'star_virus': 'celescope.rna_virus', -} \ No newline at end of file +} diff --git a/celescope/capture_virus/analysis_capture_virus.py b/celescope/capture_virus/analysis_capture_virus.py index f840e01e..0a9bec2a 100755 --- a/celescope/capture_virus/analysis_capture_virus.py +++ b/celescope/capture_virus/analysis_capture_virus.py @@ -19,9 +19,10 @@ def analysis_capture_virus(args): runner = Analysis_capture_virus(args, step_name) runner.run() + def get_opts_analysis_capture_virus(parser, sub_program): - parser.add_argument("--umi_threshold", help='method to find virus UMI threshold', - choices=['otsu', 
'none'], default='otsu') + parser.add_argument("--umi_threshold", help='method to find virus UMI threshold', + choices=['otsu', 'none'], default='otsu') if sub_program: s_common(parser) parser.add_argument('--match_dir', help='match_dir', required=True) @@ -58,7 +59,6 @@ class Analysis_capture_virus(Step, AnalysisMixin): self.add_data_item(table_dict=table_dict) self.clean_up() - def get_virus_tsne(self, virus_df): virus_tsne_df = pd.merge(self.tsne_df, virus_df, on="barcode", how="left") virus_tsne_df.to_csv(self.virus_tsne_file, sep='\t') @@ -84,4 +84,3 @@ class Analysis_capture_virus(Step, AnalysisMixin): ) df_thresh = df_virus[df_virus["UMI"] >= threshold] df_thresh.to_csv(self.otsu_virus_file, sep='\t') - diff --git a/celescope/capture_virus/count_capture_virus.py b/celescope/capture_virus/count_capture_virus.py index 74bd1b48..531fe481 100755 --- a/celescope/capture_virus/count_capture_virus.py +++ b/celescope/capture_virus/count_capture_virus.py @@ -46,7 +46,6 @@ def sum_virus(validated_barcodes, virus_bam, @add_log def count_capture_virus(args): - # 检查和创建输出目录 if not os.path.exists(args.outdir): os.system('mkdir -p %s' % (args.outdir)) @@ -71,4 +70,3 @@ def get_opts_count_capture_virus(parser, sub_program): s_common(parser) parser.add_argument('--match_dir', help='matched rna_virus directory', required=True) parser.add_argument('--virus_bam', required=True) - diff --git a/celescope/capture_virus/mkref.py b/celescope/capture_virus/mkref.py index fdd07bd0..eab4ba53 100755 --- a/celescope/capture_virus/mkref.py +++ b/celescope/capture_virus/mkref.py @@ -8,8 +8,8 @@ from celescope.tools.mkref import parse_genomeDir def parse_genomeDir_virus(genomeDir): - return parse_genomeDir(genomeDir, entrys = ('fasta',)) - + return parse_genomeDir(genomeDir, entrys=('fasta',)) + class Mkref_virus(Mkref): def __init__(self, genome_type, args): @@ -41,7 +41,7 @@ class Mkref_virus(Mkref): genome['genomeSAindexNbases'] = self.genomeSAindexNbases with open(self.config_file, 'w') as config_handle: config.write(config_handle) - + def run(self): if not self.dry_run: self.build_star_index() @@ -57,5 +57,5 @@ def mkref(args): def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: - parser.add_argument("--fasta", help="virus fasta file",required=True) - parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) \ No newline at end of file + parser.add_argument("--fasta", help="virus fasta file", required=True) + parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) diff --git a/celescope/capture_virus/multi_capture_virus.py b/celescope/capture_virus/multi_capture_virus.py index 8c5019a8..faf71f06 100755 --- a/celescope/capture_virus/multi_capture_virus.py +++ b/celescope/capture_virus/multi_capture_virus.py @@ -29,7 +29,7 @@ class Multi_capture_virus(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) - def analysis_capture_virus(self, sample): + def analysis_capture_virus(self, sample): step = 'analysis_capture_virus' cmd_line = self.get_cmd_line(step, sample) virus_file = f'{self.outdir_dic[sample]["count_capture_virus"]}/{sample}_virus_UMI_count.tsv' @@ -48,6 +48,3 @@ def main(): if __name__ == '__main__': main() - - - diff --git a/celescope/capture_virus/otsu.py b/celescope/capture_virus/otsu.py index 8f65a4dc..5a555a55 100755 --- a/celescope/capture_virus/otsu.py +++ b/celescope/capture_virus/otsu.py @@ -1,8 +1,8 @@ +import matplotlib.pyplot as plt import matplotlib import numpy as np matplotlib.use('Agg') 
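+# NB: matplotlib.use('Agg') only takes effect reliably when it runs before the
+# first pyplot import, so with pyplot now imported at the top of this module
+# the backend switch above may be a no-op on some systems.
+# For orientation: threshold_otsu() below picks the histogram cut that
+# maximizes between-class variance (Otsu's method). Rough intended usage,
+# hedged (argument and return conventions here are assumptions, not the
+# exact API):
+#     counts, bins = array2hist(np.log10(umi_array + 1))
+#     thresh = threshold_otsu(counts)
+#     makePlot(counts, thresh, 'otsu.png')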
-import matplotlib.pyplot as plt def threshold_otsu(hist): @@ -44,8 +44,8 @@ def threshold_otsu(hist): def array2hist(array, binWidth=0.2): - counts,bins = np.histogram(array, bins=np.arange(0,max(array)+binWidth,binWidth)) - return counts,bins + counts, bins = np.histogram(array, bins=np.arange(0, max(array)+binWidth, binWidth)) + return counts, bins def makePlot(hist, thresh, fname): @@ -54,4 +54,3 @@ def makePlot(hist, thresh, fname): plt.axvline(thresh, color='r') plt.savefig(fname) plt.close() - diff --git a/celescope/capture_virus/test.py b/celescope/capture_virus/test.py index ac7ba8e4..84f974f5 100755 --- a/celescope/capture_virus/test.py +++ b/celescope/capture_virus/test.py @@ -13,9 +13,9 @@ class test_capture(unittest.TestCase): def test_otsu(self): count_files = [ -'/SGRNJ02/RandD4/virus_panel/20210124_4/S1225_EBV_Skin_Auto_SDF_NEB/07.count_virus/S1225_EBV_Skin_Auto_SDF_NEB_virus_UMI_count.tsv', -'/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_Beads_Manual_KZ/04.count_capture_virus/virus_test3_R_A_Beads_Manual_KZ_virus_UMI_count.tsv', -'/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_3Mins_Manual_KZ/04.count_capture_virus/virus_test3_R_A_3Mins_Manual_KZ_virus_UMI_count.tsv' + '/SGRNJ02/RandD4/virus_panel/20210124_4/S1225_EBV_Skin_Auto_SDF_NEB/07.count_virus/S1225_EBV_Skin_Auto_SDF_NEB_virus_UMI_count.tsv', + '/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_Beads_Manual_KZ/04.count_capture_virus/virus_test3_R_A_Beads_Manual_KZ_virus_UMI_count.tsv', + '/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_3Mins_Manual_KZ/04.count_capture_virus/virus_test3_R_A_3Mins_Manual_KZ_virus_UMI_count.tsv' ] count_file = '/SGRNJ02/RandD4/virus_panel/20210124_4/S1225_EBV_Skin_Auto_SDF_NEB/07.count_virus/S1225_EBV_Skin_Auto_SDF_NEB_virus_UMI_count.tsv' for count_file in count_files: diff --git a/celescope/celescope.py b/celescope/celescope.py index 14b3b568..3a8d6308 100755 --- a/celescope/celescope.py +++ b/celescope/celescope.py @@ -7,10 +7,11 @@ from celescope.__init__ import __VERSION__, ASSAY_DICT class ArgFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter): pass + def main(): """celescope cli """ - parser = argparse.ArgumentParser(description='CeleScope',formatter_class=ArgFormatter) + parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) parser.add_argument('-v', '--version', action='version', version=__VERSION__) subparsers = parser.add_subparsers() diff --git a/celescope/citeseq/Count_cite.py b/celescope/citeseq/Count_cite.py index 7c410407..d7710829 100755 --- a/celescope/citeseq/Count_cite.py +++ b/celescope/citeseq/Count_cite.py @@ -17,7 +17,7 @@ class Count_cite(): assay, read_count_file, match_dir, - ): + ): self.sample = sample self.outdir = outdir self.assay = assay @@ -26,7 +26,7 @@ class Count_cite(): self.match_barcode, self.cell_total = read_barcode_file(match_dir) self.df_read_count = pd.read_csv(read_count_file, sep="\t", index_col=0) self.tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0] - + if not os.path.exists(outdir): os.system('mkdir -p %s' % outdir) @@ -88,9 +88,9 @@ class Count_cite(): self.stats.to_csv(self.stat_file, sep=':', header=False) t = reporter( - name='count_cite', - assay=self.assay, - sample=self.sample, - stat_file=self.stat_file, - outdir=self.outdir + '/..') - t.get_report() \ No newline at end of file + name='count_cite', + assay=self.assay, + sample=self.sample, + stat_file=self.stat_file, + outdir=self.outdir + '/..') + t.get_report() diff --git 
a/celescope/citeseq/__init__.py b/celescope/citeseq/__init__.py
index d73510e3..86a5145c 100755
--- a/celescope/citeseq/__init__.py
+++ b/celescope/citeseq/__init__.py
@@ -3,4 +3,4 @@ __ASSAY__ = 'citeseq'
 
 IMPORT_DICT = {
     'mapping_tag': 'celescope.tag'
-}
\ No newline at end of file
+}
diff --git a/celescope/citeseq/analysis_cite.py b/celescope/citeseq/analysis_cite.py
index 2c5f7403..8f061723 100755
--- a/celescope/citeseq/analysis_cite.py
+++ b/celescope/citeseq/analysis_cite.py
@@ -18,7 +18,7 @@ def analysis_cite(args):
 
     if not os.path.exists(args.outdir):
         os.system('mkdir -p %s' % args.outdir)
-    
+
     rds = parse_match_dir(args.match_dir)['rds']
     app = CITESEQ_DIR + "/analysis_cite.R"
     cmd = (
@@ -29,4 +29,3 @@ def analysis_cite(args):
         f'--sample {args.sample} '
     )
     os.system(cmd)
-    
\ No newline at end of file
diff --git a/celescope/citeseq/count_cite.py b/celescope/citeseq/count_cite.py
index b0a6486a..d2563202 100755
--- a/celescope/citeseq/count_cite.py
+++ b/celescope/citeseq/count_cite.py
@@ -20,4 +20,4 @@ def count_cite(args):
         args.match_dir,
     )
     count_cite_object.run()
-    count_cite_object.report()
\ No newline at end of file
+    count_cite_object.report()
diff --git a/celescope/citeseq/multi_citeseq.py b/celescope/citeseq/multi_citeseq.py
index b1a64081..0ec65bc9 100755
--- a/celescope/citeseq/multi_citeseq.py
+++ b/celescope/citeseq/multi_citeseq.py
@@ -4,4 +4,3 @@ def main():
 
     # TODO
     pass
-
diff --git a/celescope/fusion/count_fusion.py b/celescope/fusion/count_fusion.py
index ab81745d..b4facae2 100755
--- a/celescope/fusion/count_fusion.py
+++ b/celescope/fusion/count_fusion.py
@@ -115,9 +115,8 @@ class CountFusion(Step):
         os.system(cmd)
         count_fusion.logger.info("plot done.")
 
-
     def run(self):
-        self.count_fusion()
+        self.count_fusion()
         self.clean_up()
 
 
diff --git a/celescope/fusion/mkref.py b/celescope/fusion/mkref.py
index 0e7c7841..17134c48 100755
--- a/celescope/fusion/mkref.py
+++ b/celescope/fusion/mkref.py
@@ -8,8 +8,8 @@ from celescope.tools.mkref import parse_genomeDir
 
 
 def parse_genomeDir_fusion(genomeDir):
-    return parse_genomeDir(genomeDir, entrys = ('fasta','fusion_pos'))
-    
+    return parse_genomeDir(genomeDir, entrys=('fasta', 'fusion_pos'))
+
 
 class Mkref_fusion(Mkref):
     def __init__(self, genome_type, args):
@@ -43,7 +43,7 @@ class Mkref_fusion(Mkref):
         genome['genomeSAindexNbases'] = self.genomeSAindexNbases
         with open(self.config_file, 'w') as config_handle:
             config.write(config_handle)
-    
+
     def run(self):
         if not self.dry_run:
             self.build_star_index()
@@ -59,9 +59,9 @@ def mkref(args):
 def get_opts_mkref(parser, sub_program):
     opts(parser, sub_program)
     if sub_program:
-        parser.add_argument("--fasta", help="fusion fasta file",required=True)
+        parser.add_argument("--fasta", help="fusion fasta file", required=True)
         parser.add_argument(
-            "--fusion_pos",
+            "--fusion_pos",
             help="""
fusion position file. A two column tab-delimited text file with header.
"pos" is the end position of the first gene(1-based).
@@ -71,6 +71,6 @@ PML_3\t183 PML_4\t254 PML_5\t326 PML_6\t204 -""", - required=True,) - parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) \ No newline at end of file +""", + required=True,) + parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) diff --git a/celescope/fusion/multi_fusion.py b/celescope/fusion/multi_fusion.py index b82cd4be..b69de656 100755 --- a/celescope/fusion/multi_fusion.py +++ b/celescope/fusion/multi_fusion.py @@ -33,5 +33,3 @@ def main(): if __name__ == '__main__': main() - - diff --git a/celescope/fusion/star_fusion.py b/celescope/fusion/star_fusion.py index 54c02d8b..c70494f9 100755 --- a/celescope/fusion/star_fusion.py +++ b/celescope/fusion/star_fusion.py @@ -26,4 +26,3 @@ def get_opts_star_fusion(parser, sub_program): # will cause `conflicting option string: --genomeDir` # parser.add_argument('--genomeDir', help=argparse.SUPPRESS) parser.add_argument('--fusion_genomeDir', help='fusion gene STAR index genome directory', required=True) - diff --git a/celescope/hla/mapping_hla.py b/celescope/hla/mapping_hla.py index db08e6ba..bce9c4e1 100755 --- a/celescope/hla/mapping_hla.py +++ b/celescope/hla/mapping_hla.py @@ -148,10 +148,10 @@ def hla_typing(index_file, outdir, thread): @add_log def summary(index_file, outdir, sample): - + n = 0 df_valid = read_index(index_file) - + for index in df_valid.index: try: sub_df = pd.read_csv( diff --git a/celescope/hla/multi_hla.py b/celescope/hla/multi_hla.py index f5f4cc8b..21e44651 100755 --- a/celescope/hla/multi_hla.py +++ b/celescope/hla/multi_hla.py @@ -1,7 +1,8 @@ def main(): - #TODO + # TODO pass + if __name__ == '__main__': main() diff --git a/celescope/hla/test_hla.py b/celescope/hla/test_hla.py index 284c856b..a50e3ccc 100755 --- a/celescope/hla/test_hla.py +++ b/celescope/hla/test_hla.py @@ -30,7 +30,7 @@ class testHLA(unittest.TestCase): def test_read_index(self): read_index(self.index_file) - #@unittest.skip('pass') + # @unittest.skip('pass') def test_summary(self): summary(self.index_file, self.mapping_outdir, self.sample) diff --git a/celescope/mut/count_mut.py b/celescope/mut/count_mut.py index 094ce94c..aa72abd3 100755 --- a/celescope/mut/count_mut.py +++ b/celescope/mut/count_mut.py @@ -50,7 +50,6 @@ def count_mut(args): mut_dic = read_mut(mut_file) out_prefix = outdir + "/" + sample - # tsne match_dict = parse_match_dir(match_dir) df_tsne = pd.read_csv(match_dict['tsne_coord'], sep="\t", index_col=0) @@ -142,7 +141,7 @@ def count_mut(args): out_insertion_barcode_count_file, sep="\t") df_tsne_mut = pd.merge(df_tsne, df_insertion_barcode_count, - right_index=True, left_index=True, how="left") + right_index=True, left_index=True, how="left") df_tsne_mut.fillna(0, inplace=True) df_tsne_mut.to_csv(out_tsne_file, sep="\t") diff --git a/celescope/mut/mapping_mut.py b/celescope/mut/mapping_mut.py index 14d2cc1d..f260af22 100755 --- a/celescope/mut/mapping_mut.py +++ b/celescope/mut/mapping_mut.py @@ -25,4 +25,4 @@ def get_opts_mapping_mut(parser, sub_program): help='insertion or deletion STAR indexed genome directory', required=True) parser.add_argument("--thread", help='STAR thread', default=1) - parser.add_argument("--outFilterMatchNmin", help='STAR outFilterMatchNmin', default=35) \ No newline at end of file + parser.add_argument("--outFilterMatchNmin", help='STAR outFilterMatchNmin', default=35) diff --git a/celescope/mut/multi_mut.py b/celescope/mut/multi_mut.py index f77d0c01..7fc122f3 100755 --- a/celescope/mut/multi_mut.py +++ 
b/celescope/mut/multi_mut.py @@ -43,6 +43,6 @@ def main(): multi = Multi_mut(__ASSAY__) multi.run() + if __name__ == '__main__': main() - diff --git a/celescope/rna/__init__.py b/celescope/rna/__init__.py index c437cef4..d1b579b8 100755 --- a/celescope/rna/__init__.py +++ b/celescope/rna/__init__.py @@ -9,11 +9,11 @@ __STEPS__ = [ 'analysis'] __ASSAY__ = 'rna' -# m: memory +# m: memory # x: thread RESOURCE = { - 'sample': {'m':1, 'x':1}, - 'barcode': {'m':5, 'x':1}, - 'cutadapt': {'m':5, 'x':1}, - 'star': {'m':30, 'x':1}, + 'sample': {'m': 1, 'x': 1}, + 'barcode': {'m': 5, 'x': 1}, + 'cutadapt': {'m': 5, 'x': 1}, + 'star': {'m': 30, 'x': 1}, } diff --git a/celescope/rna/analysis.py b/celescope/rna/analysis.py index 560b3768..8045e329 100755 --- a/celescope/rna/analysis.py +++ b/celescope/rna/analysis.py @@ -35,11 +35,12 @@ class Analysis_rna(Step, AnalysisMixin): - `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", + - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", it means that the given marker is not enough to identify the cluster. - - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) + - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. + - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) """ + def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) @@ -77,7 +78,7 @@ def get_opts_analysis(parser, sub_program): parser.add_argument('--genomeDir', help='Required. Genome directory.', required=True) parser.add_argument('--save_rds', action='store_true', help='Write rds to disk.') parser.add_argument( - '--type_marker_tsv', + '--type_marker_tsv', help="""A tsv file with header. If this parameter is provided, cell type will be annotated. Example: ``` cell_type marker @@ -94,11 +95,8 @@ LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" ) if sub_program: parser.add_argument( - '--matrix_file', - help='Required. Matrix_10X directory from step count.', + '--matrix_file', + help='Required. 
Matrix_10X directory from step count.', required=True, ) parser = s_common(parser) - - - diff --git a/celescope/rna/mkref.py b/celescope/rna/mkref.py index 606547b4..6e8ff3ce 100755 --- a/celescope/rna/mkref.py +++ b/celescope/rna/mkref.py @@ -7,8 +7,8 @@ from celescope.tools.mkref import get_opts_mkref as opts def parse_genomeDir_rna(genomeDir): - return parse_genomeDir(genomeDir, entrys = ('fasta', 'gtf', 'mt_gene_list')) - + return parse_genomeDir(genomeDir, entrys=('fasta', 'gtf', 'mt_gene_list')) + class Mkref_rna(Mkref): """ @@ -32,13 +32,14 @@ class Mkref_rna(Mkref): refflat = Homo_sapiens_ensembl_99.refFlat ``` """ + def __init__(self, genome_type, args): Mkref.__init__(self, genome_type, args) self.fasta = args.fasta self.gtf = args.gtf self.mt_gene_list = args.mt_gene_list - # out file + # out file self.refflat = f'{self.genome_name}.refFlat' @utils.add_log @@ -79,7 +80,7 @@ class Mkref_rna(Mkref): ) Mkref_rna.build_refflat.logger.info(cmd) subprocess.check_call(cmd, shell=True) - + @utils.add_log def run(self): if not self.dry_run: @@ -87,6 +88,7 @@ class Mkref_rna(Mkref): self.build_star_index() self.write_config() + def mkref(args): genome_type = 'rna' runner = Mkref_rna(genome_type, args) @@ -97,19 +99,19 @@ def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: parser.add_argument( - "--fasta", - help="Required. Genome fasta file. Must be relative file path to genomeDir.", + "--fasta", + help="Required. Genome fasta file. Must be relative file path to genomeDir.", required=True ) parser.add_argument( - "--gtf", - help="Required. Genome gtf file. Must be relative file path to genomeDir.", + "--gtf", + help="Required. Genome gtf file. Must be relative file path to genomeDir.", required=True ) parser.add_argument( - "--mt_gene_list", + "--mt_gene_list", help="""Mitochondria gene list file. Must be relative file path to genomeDir. It is a plain text file with one gene per line. 
-If not provided, will use `MT-` and `mt-` to determine mitochondria genes.""", +If not provided, will use `MT-` and `mt-` to determine mitochondria genes.""", default="None" ) diff --git a/celescope/rna/multi_rna.py b/celescope/rna/multi_rna.py index 0e995e8b..cbea5d52 100755 --- a/celescope/rna/multi_rna.py +++ b/celescope/rna/multi_rna.py @@ -5,9 +5,11 @@ from celescope.tools.multi import Multi class Multi_rna(Multi): pass + def main(): multi = Multi_rna(__ASSAY__) multi.run() + if __name__ == '__main__': main() diff --git a/celescope/rna/star.py b/celescope/rna/star.py index 3e9e1f4d..9f1512e1 100755 --- a/celescope/rna/star.py +++ b/celescope/rna/star.py @@ -67,7 +67,7 @@ class Star_rna(Step, StarMixin): data = picard_log.readline().strip().split('\t') region_dict = dict(zip(header, data)) break - + total = float(region_dict['PF_ALIGNED_BASES']) exonic_regions = int(region_dict['UTR_BASES']) + \ int(region_dict['CODING_BASES']) @@ -75,8 +75,8 @@ class Star_rna(Step, StarMixin): intergenic_regions = int(region_dict['INTERGENIC_BASES']) self.add_metric( - name='Base Pairs Mapped to Exonic Regions', - value=exonic_regions, + name='Base Pairs Mapped to Exonic Regions', + value=exonic_regions, total=total, ) self.add_metric( @@ -86,7 +86,7 @@ class Star_rna(Step, StarMixin): ) self.add_metric( name='Base Pairs Mapped to Intergenic Regions', - value=intergenic_regions, + value=intergenic_regions, total=total, ) @@ -107,10 +107,9 @@ class Star_rna(Step, StarMixin): ) region_plot = {'region_labels': ['Exonic Regions', 'Intronic Regions', 'Intergenic Regions'], - 'region_values': [exonic_regions, intronic_regions, intergenic_regions]} + 'region_values': [exonic_regions, intronic_regions, intergenic_regions]} self.add_content_item("data", STAR_plot=region_plot) - @utils.add_log def ribo(self): # TODO remove bbduk.sh and use picard ribo bases @@ -159,4 +158,4 @@ def star(args): def get_opts_star(parser, sub_program): - get_opts_star_mixin(parser, sub_program) \ No newline at end of file + get_opts_star_mixin(parser, sub_program) diff --git a/celescope/rna_virus/__init__.py b/celescope/rna_virus/__init__.py index 0601c4ca..3bf7e213 100755 --- a/celescope/rna_virus/__init__.py +++ b/celescope/rna_virus/__init__.py @@ -12,4 +12,4 @@ __STEPS__ = [ __ASSAY__ = 'rna_virus' IMPORT_DICT = { 'star': 'celescope.rna' -} \ No newline at end of file +} diff --git a/celescope/rna_virus/analysis_rna_virus.py b/celescope/rna_virus/analysis_rna_virus.py index 2548e30f..5da51724 100755 --- a/celescope/rna_virus/analysis_rna_virus.py +++ b/celescope/rna_virus/analysis_rna_virus.py @@ -124,4 +124,3 @@ def get_opts_analysis_rna_virus(parser, sub_program): '--virus_file', help='virus UMI count file', required=True) - \ No newline at end of file diff --git a/celescope/rna_virus/count_virus.py b/celescope/rna_virus/count_virus.py index 46ab2880..1cd9f04e 100755 --- a/celescope/rna_virus/count_virus.py +++ b/celescope/rna_virus/count_virus.py @@ -76,4 +76,3 @@ def get_opts_count_virus(parser, sub_program): s_common(parser) parser.add_argument('--virus_bam', required=True) parser.add_argument('--barcode_file', required=True) - diff --git a/celescope/rna_virus/multi_rna_virus.py b/celescope/rna_virus/multi_rna_virus.py index b40181b0..b921353e 100755 --- a/celescope/rna_virus/multi_rna_virus.py +++ b/celescope/rna_virus/multi_rna_virus.py @@ -4,7 +4,6 @@ from celescope.tools.multi import Multi class Multi_rna_virus(Multi): - def star_virus(self, sample): step = 'star_virus' fq = 
f'{self.outdir_dic[sample]["cutadapt"]}/{sample}_clean_2.fq{self.fq_suffix}' @@ -28,7 +27,7 @@ class Multi_rna_virus(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) - def analysis_rna_virus(self, sample): + def analysis_rna_virus(self, sample): step = 'analysis_rna_virus' virus_file = f'{self.outdir_dic[sample]["count_virus"]}/{sample}_virus_UMI_count.tsv' matrix_file = f'{self.outdir_dic[sample]["count"]}/{sample}_matrix.tsv.gz' diff --git a/celescope/rna_virus/star_virus.py b/celescope/rna_virus/star_virus.py index 3f6fae9a..905f58b9 100755 --- a/celescope/rna_virus/star_virus.py +++ b/celescope/rna_virus/star_virus.py @@ -7,13 +7,13 @@ class StarVirus(Step, StarMixin): """ star virus class """ + def __init__(self, args, step_name): - # add genomeDir + # add genomeDir args.genomeDir = args.virus_genomeDir - - Step.__init__(self, args, step_name) - StarMixin.__init__(self, args, add_prefix='virus') + Step.__init__(self, args, step_name) + StarMixin.__init__(self, args, add_prefix='virus') def run(self): self.run_star() @@ -30,4 +30,3 @@ def star_virus(args): def get_opts_star_virus(parser, sub_program): get_opts_star_mixin(parser, sub_program) parser.add_argument('--virus_genomeDir', help='virus genome dir', required=True) - diff --git a/celescope/snp/__init__.py b/celescope/snp/__init__.py index c9f17e51..1f9607bf 100755 --- a/celescope/snp/__init__.py +++ b/celescope/snp/__init__.py @@ -1,6 +1,6 @@ __STEPS__ = [ 'mkref', - 'sample', 'barcode', 'cutadapt', 'consensus', 'star', 'featureCounts', + 'sample', 'barcode', 'cutadapt', 'consensus', 'star', 'featureCounts', 'target_metrics', 'variant_calling', 'analysis_snp' ] __ASSAY__ = 'snp' diff --git a/celescope/snp/analysis_snp.py b/celescope/snp/analysis_snp.py index 6a109073..80aee4a1 100755 --- a/celescope/snp/analysis_snp.py +++ b/celescope/snp/analysis_snp.py @@ -22,7 +22,7 @@ class Analysis_variant(Step, AnalysisMixin): self.annovar_config = args.annovar_config self.match_dir = args.match_dir self.vcf_GT = None - + def get_df_count_tsne(self): ''' output: f'{self.outdir}/{self.sample}_count_tsne.tsv' @@ -30,8 +30,8 @@ class Analysis_variant(Step, AnalysisMixin): df_vc = pd.read_csv(self.variant_count_file, sep='\t') df_vc = df_vc[df_vc["alt_count"] > 0] df_vc_cell = df_vc.groupby('CID').agg({ - 'alt_count':'count', - 'VID':list, + 'alt_count': 'count', + 'VID': list, }) df_CID, _df_valid = read_CID(self.CID_file) @@ -39,7 +39,7 @@ class Analysis_variant(Step, AnalysisMixin): tsne_df_CID = pd.merge(self.tsne_df, df_CID, on='barcode', how='left') df_vc_barcode = pd.merge(df_vc_cell, df_CID, on='CID') - df_vc_barcode_tsne = pd.merge(df_vc_barcode, tsne_df_CID, on=['barcode','CID'], how='right') + df_vc_barcode_tsne = pd.merge(df_vc_barcode, tsne_df_CID, on=['barcode', 'CID'], how='right') df_vc_barcode_tsne['value'] = df_vc_barcode_tsne['alt_count'] df_vc_barcode_tsne['value'] = df_vc_barcode_tsne['value'].fillna(0) df_vc_barcode_tsne['value'].astype('int32') @@ -61,7 +61,7 @@ class Analysis_variant(Step, AnalysisMixin): text = list(df_count_tsne.apply(return_text, axis=1)) value = list(df_count_tsne.value) title = 't-SNE plot Colored by Cell Variant Counts' - count_tsne = {"tSNE_1": tSNE_1, "tSNE_2": tSNE_2, "text": text, 'value':value, 'title':title} + count_tsne = {"tSNE_1": tSNE_1, "tSNE_2": tSNE_2, "text": text, 'value': value, 'title': title} return count_tsne def add_GT(self): @@ -73,28 +73,26 @@ class Analysis_variant(Step, AnalysisMixin): out_vcf = pysam.VariantFile(out_vcf_file, 'w', header=vcf.header) for rec in vcf: 
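+            # Overwrite every genotype as homozygous alt (1, 1), presumably so
+            # the downstream ANNOVAR annotation always sees an explicit GT call.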
for sample in rec.samples: - rec.samples[sample]["GT"] = (1,1) + rec.samples[sample]["GT"] = (1, 1) out_vcf.write(rec) vcf.close() out_vcf.close() self.vcf_GT = out_vcf_file - def get_df_table(self): - - df_vcf = utils.parse_vcf(self.vcf_GT, infos=['VID','CID']) + + df_vcf = utils.parse_vcf(self.vcf_GT, infos=['VID', 'CID']) df_annovar = self.annovar() df_vcf = pd.concat((df_vcf, df_annovar), axis=1) - df_vcf["nCell"] = df_vcf["CID"].apply(func=lambda row:1 if isinstance(row,str) else len(row)) + df_vcf["nCell"] = df_vcf["CID"].apply(func=lambda row: 1 if isinstance(row, str) else len(row)) out_df_vcf = f'{self.outdir}/{self.sample}_variant_table.tsv' df_vcf.to_csv(out_df_vcf, sep='\t', index=False) - cols = ['VID','Chrom','Pos','Alleles','Gene','nCell','mRNA','Protein','COSMIC'] + cols = ['VID', 'Chrom', 'Pos', 'Alleles', 'Gene', 'nCell', 'mRNA', 'Protein', 'COSMIC'] df_vcf = df_vcf[cols] return df_vcf - def run(self): self.add_GT() cluster_tsne = self.get_cluster_tsne(colname='cluster', tsne_df=self.tsne_df) @@ -157,6 +155,7 @@ def analysis_snp(args): step_snp = Analysis_variant(args, step) step_snp.run() + def get_opts_analysis_snp(parser, sub_program): parser.add_argument('--annovar_config', help='annovar soft config file', required=True) if sub_program: @@ -164,4 +163,4 @@ def get_opts_analysis_snp(parser, sub_program): parser.add_argument('--match_dir', help='match_dir', required=True) parser.add_argument('--vcf', help='vcf file', required=True) parser.add_argument('--CID_file', help='CID_file', required=True) - parser.add_argument('--variant_count_file', help='variant count file', required=True) \ No newline at end of file + parser.add_argument('--variant_count_file', help='variant count file', required=True) diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py index a8e0a006..b5632a9f 100644 --- a/celescope/snp/mkref.py +++ b/celescope/snp/mkref.py @@ -66,4 +66,3 @@ def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: parser.add_argument("--fasta", help="fasta file", required=True) - diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index bad75811..69dbc418 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -12,7 +12,7 @@ class Multi_snp(Multi): else: fq = f'{self.outdir_dic[sample]["consensus"]}/{sample}_consensus.fq' cmd_line += ' --consensus_fq ' - + cmd = ( f'{cmd_line} ' f'--fq {fq} ' @@ -30,7 +30,6 @@ class Multi_snp(Multi): ) self.process_cmd(cmd, step, sample, m=2, x=1) - def variant_calling(self, sample): step = 'variant_calling' cmd_line = self.get_cmd_line(step, sample) @@ -62,6 +61,6 @@ def main(): multi = Multi_snp(__ASSAY__) multi.run() + if __name__ == '__main__': main() - diff --git a/celescope/snp/tests/test_variant_calling.py b/celescope/snp/tests/test_variant_calling.py index 5d612b00..cdeb3d55 100644 --- a/celescope/snp/tests/test_variant_calling.py +++ b/celescope/snp/tests/test_variant_calling.py @@ -5,21 +5,22 @@ from celescope.snp.variant_calling import Variant_calling ROOT_DIR = os.path.dirname(__file__) + class Test_variant_calling(unittest.TestCase): def setUp(self): os.chdir(ROOT_DIR) Args = namedtuple("Args", "thread outdir sample assay debug " + "genomeDir vcf bam match_dir") self.args = Args( - thread=10, - outdir="./test_output/07.variant_calling", - sample="test1", - assay="snp", - debug=False, - genomeDir="/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92", - vcf=None, - bam="./test_data/06.target_metrics/subset_filter.bam", - match_dir="./test_data/match_dir", - 
) + thread=10, + outdir="./test_output/07.variant_calling", + sample="test1", + assay="snp", + debug=False, + genomeDir="/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92", + vcf=None, + bam="./test_data/06.target_metrics/subset_filter.bam", + match_dir="./test_data/match_dir", + ) def test_run(self): obj = Variant_calling(self.args, "variant_calling") @@ -35,4 +36,4 @@ class Test_variant_calling(unittest.TestCase): obj.write_VID_file() obj.get_UMI() obj.write_support_matrix() - obj.clean_up() \ No newline at end of file + obj.clean_up() diff --git a/celescope/snp/utils/plot_vid.py b/celescope/snp/utils/plot_vid.py index 86f489f5..5211d01a 100644 --- a/celescope/snp/utils/plot_vid.py +++ b/celescope/snp/utils/plot_vid.py @@ -14,21 +14,24 @@ SAMPLE_COL_INDEX = 2 MATCH_DIR_COL_INDEX = 3 VID_COL_INDEX = 4 + @utils.add_log def parse_mapfile(mapfile): sample_vid_dict = {} sample_match_dir_dict = {} df_mapfile = pd.read_csv(mapfile, sep='\t', header=None) - def read_row(row): + + def read_row(row): sample = row[SAMPLE_COL_INDEX] match_dir = row[MATCH_DIR_COL_INDEX] - vid_list = [int(vid) for vid in row[VID_COL_INDEX ].strip().split(',')] + vid_list = [int(vid) for vid in row[VID_COL_INDEX].strip().split(',')] sample_vid_dict[sample] = vid_list sample_match_dir_dict[sample] = match_dir df_mapfile.apply(read_row, axis=1) return sample_vid_dict, sample_match_dir_dict + class Plot_vid(): def __init__(self, sample, outdir, vid_list, snp_dir, match_dir): self.sample = sample @@ -36,9 +39,9 @@ class Plot_vid(): # set vid_tsne_file = glob.glob(f'{snp_dir}/08.analysis_snp/*count_tsne.tsv')[0] - self.df_vid_tsne = pd.read_csv(vid_tsne_file, sep='\t', converters={"VID":ast.literal_eval}) + self.df_vid_tsne = pd.read_csv(vid_tsne_file, sep='\t', converters={"VID": ast.literal_eval}) match_tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0] - self.df_match_tsne = pd.read_csv( match_tsne_file, sep='\t', index_col=0) + self.df_match_tsne = pd.read_csv(match_tsne_file, sep='\t', index_col=0) # out if not os.path.exists(outdir): @@ -56,10 +59,10 @@ class Plot_vid(): break return row df = self.df_vid_tsne.apply(set_label, axis=1) - barcode_list = df.loc[df["VIDs"]=="mutation",]["barcode"] + barcode_list = df.loc[df["VIDs"] == "mutation", ]["barcode"] self.df_match_tsne["VIDs"] = "wild_type" self.df_match_tsne.loc[barcode_list, "VIDs"] = "mutation" - plot = ggplot(self.df_match_tsne, aes(x="tSNE_1",y="tSNE_2",color="VIDs")) + geom_point(size=0.2) + plot = ggplot(self.df_match_tsne, aes(x="tSNE_1", y="tSNE_2", color="VIDs")) + geom_point(size=0.2) plot.save(self.out_plot_file) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index 4d3e7324..7429e0c4 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -15,7 +15,6 @@ from celescope.tools.step import Step, s_common from celescope.rna.mkref import parse_genomeDir_rna - def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): ''' parse vcf into df @@ -34,11 +33,11 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): rec_dict['alt'] = '.' if len(rec_dict['alleles']) == 2: rec_dict['alt'] = rec_dict['alleles'][1] - + for info in infos: rec_dict[info] = rec.info[info] - df = df.append(pd.Series(rec_dict),ignore_index=True) + df = df.append(pd.Series(rec_dict), ignore_index=True) return df @@ -67,8 +66,8 @@ class Variant_calling(Step): 2 : all reads/UMIs at the position support the alt allele. 
3 : one or more reads/UMIs support both the alt and the ref allele. """ - - def __init__(self, args, step_name): + + def __init__(self, args, step_name): Step.__init__(self, args, step_name) # set @@ -80,7 +79,7 @@ class Variant_calling(Step): self.vcf_bool = False self.df_vcf = None - # out + # out self.splitN_bam = f'{self.out_prefix}_splitN.bam' self.CID_file = f'{self.out_prefix}_CID.tsv' self.VID_file = f'{self.out_prefix}_VID.tsv' @@ -88,7 +87,6 @@ class Variant_calling(Step): self.variant_count_file = f'{self.out_prefix}_variant_count.tsv' self.support_matrix_file = f'{self.out_prefix}_support.mtx' - @utils.add_log def SplitNCigarReads(self): cmd = ( @@ -101,7 +99,6 @@ class Variant_calling(Step): Variant_calling.SplitNCigarReads.logger.info(cmd) subprocess.check_call(cmd, shell=True) - @utils.add_log def split_bam(self): ''' @@ -118,7 +115,7 @@ class Variant_calling(Step): bam_dict = defaultdict(list) CID_dict = defaultdict(dict) cells_dir = f'{self.outdir}/cells/' - + # read bam and split samfile = pysam.AlignmentFile(self.splitN_bam, "rb") header = samfile.header @@ -134,7 +131,6 @@ class Variant_calling(Step): # assign read to barcode bam_dict[barcode].append(read) - self.split_bam.logger.info('writing cell bam...') # write new bam CID = 0 @@ -174,7 +170,7 @@ class Variant_calling(Step): f'samtools sort {bam} -o {sorted_bam}' ) subprocess.check_call(cmd_sort, shell=True) - + # mpileup bcf = f'{outdir}/cells/cell{CID}/cell{CID}.bcf' cmd_mpileup = ( @@ -238,7 +234,6 @@ class Variant_calling(Step): def read_CID(self): return read_CID(self.CID_file) - @utils.add_log def merge_vcf(self): ''' @@ -256,7 +251,7 @@ class Variant_calling(Step): for CID in CIDs: CID = str(CID) vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' - vcf = pysam.VariantFile(vcf_file,'r') + vcf = pysam.VariantFile(vcf_file, 'r') for rec in vcf.fetch(): v = ','.join([str(getattr(rec, col)) for col in v_cols]) if not v in v_dict: @@ -270,12 +265,12 @@ class Variant_calling(Step): def get_vcf_header(CIDs): CID = CIDs[0] vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' - vcf = pysam.VariantFile(vcf_file,'r') + vcf = pysam.VariantFile(vcf_file, 'r') return vcf.header vcf_header = get_vcf_header(CIDs) vcf_header.info.add('VID', number=1, type='String', description='Variant ID') vcf_header.info.add('CID', number=1, type='String', description='Cell ID') - merged_vcf = pysam.VariantFile(self.final_vcf_file,'w', header=vcf_header) + merged_vcf = pysam.VariantFile(self.final_vcf_file, 'w', header=vcf_header) VID = 0 for v in sorted(v_dict.keys()): @@ -285,7 +280,7 @@ class Variant_calling(Step): record = merged_vcf.new_record() cols = ['chrom', 'pos', 'alleles'] for col in cols: - setattr(record,col, getattr(rec,col)) + setattr(record, col, getattr(rec, col)) record.info['VID'] = str(VID) record.info['CID'] = CID merged_vcf.write(record) @@ -294,12 +289,12 @@ class Variant_calling(Step): @utils.add_log def write_VID_file(self): df_vcf = parse_vcf(self.final_vcf_file) - df_VID = df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] + df_VID = df_vcf.loc[:, ['VID', 'chrom', 'pos', 'ref', 'alt']] df_VID.to_csv(self.VID_file, sep='\t', index=False) @utils.add_log def add_VID(self): - vcf = pysam.VariantFile(self.args.vcf,'r') + vcf = pysam.VariantFile(self.args.vcf, 'r') vcf_header = vcf.header if 'VID' in vcf_header.info: logging.info('VID is already in vcf file!') @@ -309,7 +304,7 @@ class Variant_calling(Step): VID = 0 for rec in vcf.fetch(): VID += 1 - rec.info['VID'] = str(VID) + rec.info['VID'] = 
str(VID) VID_vcf.write(rec) VID_vcf.close() @@ -323,9 +318,9 @@ class Variant_calling(Step): def get_DP4(row, alt): DP4 = row['DP4'].iloc[0] if alt == 'ref': - indexs = [0,1] + indexs = [0, 1] elif alt == 'alt': - indexs = [2,3] + indexs = [2, 3] umi = sum([DP4[index] for index in indexs]) return umi @@ -333,9 +328,9 @@ class Variant_calling(Step): pos = row['pos'] chrom = row['chrom'] alt = row['alt'] - df_pos = df_cell_vcf[(df_cell_vcf['pos']==pos) & (df_cell_vcf['chrom']==chrom)] - df_ref = df_pos[df_pos['alt']=='.'] - df_alt = df_pos[df_pos['alt']==alt] + df_pos = df_cell_vcf[(df_cell_vcf['pos'] == pos) & (df_cell_vcf['chrom'] == chrom)] + df_ref = df_pos[df_pos['alt'] == '.'] + df_alt = df_pos[df_pos['alt'] == alt] ref_UMI = 0 alt_UMI = 0 if df_ref.shape[0] != 0: @@ -345,15 +340,15 @@ class Variant_calling(Step): return ref_UMI, alt_UMI, pos, chrom, alt for index in df_vcf.index: - row = df_vcf.loc[index,] + row = df_vcf.loc[index, ] ref_UMI, alt_UMI, _pos, _chrom, _alt = map_vcf_row(row, df_cell_vcf) if (ref_UMI + alt_UMI) != 0: VID = row['VID'] dic = { - 'VID':VID, - 'CID':CID, - 'ref_count':ref_UMI, - 'alt_count':alt_UMI, + 'VID': VID, + 'CID': CID, + 'ref_count': ref_UMI, + 'alt_count': alt_UMI, } df_UMI = df_UMI.append(dic, ignore_index=True) return df_UMI @@ -363,7 +358,7 @@ class Variant_calling(Step): ''' get variant and ref UMI supporting an allele ''' - _df_index, df_valid = self.read_CID() + _df_index, df_valid = self.read_CID() df_UMI_list = [] CID_arg = list(df_valid.index) @@ -372,12 +367,12 @@ class Variant_calling(Step): with ProcessPoolExecutor(self.thread) as pool: for res in pool.map(Variant_calling.cell_UMI, CID_arg, outdir_arg, final_vcf_file_arg): df_UMI_list.append(res) - + df_UMI = pd.concat(df_UMI_list) df_UMI['VID'] = df_UMI['VID'].astype('int') - df_UMI.sort_values(by=['VID','CID'], inplace=True) + df_UMI.sort_values(by=['VID', 'CID'], inplace=True) df_UMI.to_csv(self.variant_count_file, sep='\t', index=False) - + @utils.add_log def write_support_matrix(self): def set_support_bit(row): @@ -390,9 +385,8 @@ class Variant_calling(Step): df_variant_count['support'] = df_variant_count.apply(set_support_bit, axis=1) support_mtx = coo_matrix( (df_variant_count.support, (df_variant_count.VID - 1, df_variant_count.CID - 1)) - ) + ) mmwrite(self.support_matrix_file, support_mtx) - def run(self): self.SplitNCigarReads() @@ -420,20 +414,20 @@ def get_opts_variant_calling(parser, sub_program): parser.add_argument("--genomeDir", help=HELP_DICT['genomeDir'], required=True) parser.add_argument( - "--vcf", + "--vcf", help="""VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level -and use these variants as input vcf.""", +and use these variants as input vcf.""", required=False ) if sub_program: parser.add_argument( "--bam", - help='Input BAM file from step `target_metrics`. ', + help='Input BAM file from step `target_metrics`. ', required=True ) parser.add_argument( - "--match_dir", - help=HELP_DICT['match_dir'], + "--match_dir", + help=HELP_DICT['match_dir'], required=True ) s_common(parser) diff --git a/celescope/tag/analysis_tag.py b/celescope/tag/analysis_tag.py index 4209efff..5c04c5fb 100755 --- a/celescope/tag/analysis_tag.py +++ b/celescope/tag/analysis_tag.py @@ -10,6 +10,7 @@ class Analysis_tag(Step, AnalysisMixin): Features - Combine scRNA-Seq clustering infromation with tag assignment. 
""" + def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) @@ -31,8 +32,9 @@ def get_opts_analysis_tag(parser, sub_program): parser.add_argument("--match_dir", help="Match celescope scRNA-Seq directory. ", required=True) parser = s_common(parser) + @utils.add_log def analysis_tag(args): step_name = 'analysis_tag' ana = Analysis_tag(args, step_name) - ana.run() \ No newline at end of file + ana.run() diff --git a/celescope/tag/count_tag.py b/celescope/tag/count_tag.py index 52cc42b9..6d9121a2 100755 --- a/celescope/tag/count_tag.py +++ b/celescope/tag/count_tag.py @@ -2,44 +2,43 @@ assign cell identity based on SNR and UMI_min """ +from celescope.__init__ import ROOT_PATH +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt import subprocess import matplotlib matplotlib.use('Agg') -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -import celescope.tools.utils as utils -from celescope.tools.step import Step, s_common -from celescope.__init__ import ROOT_PATH def get_opts_count_tag(parser, sub_program): parser.add_argument( "--UMI_min", - help="Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*.", + help="Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*.", default="auto" ) parser.add_argument( - "--dim", - help="Default=1. Tag dimentions. Usually we use 1-dimentional tag.", + "--dim", + help="Default=1. Tag dimentions. Usually we use 1-dimentional tag.", default=1 ) parser.add_argument( "--SNR_min", help="""Default='auto'. Minimum signal-to-noise ratio. -Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. """, +Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. """, default="auto" ) parser.add_argument("--combine_cluster", - help="Conbine cluster tsv file.", default=None) + help="Conbine cluster tsv file.", default=None) parser.add_argument( - "--coefficient", + "--coefficient", help="""Default=0.1. If `SNR_min` is 'auto', minimum signal-to-noise ratio is calulated as `SNR_min = max(median(SNRs) * coefficient, 2)`. 
-Smaller `coefficient` will cause less *multiplet* in the tag assignment.""", +Smaller `coefficient` will cause less *multiplet* in the tag assignment.""", default=0.1 ) if sub_program: @@ -47,6 +46,7 @@ Smaller `coefficient` will cause less *multiplet* in the tag assignment.""", parser.add_argument("--match_dir", help="Match celescope scRNA-Seq directory.", required=True) s_common(parser) + def count_tag(args): step_name = "count_tag" @@ -85,11 +85,11 @@ class Count_tag(Step): # read self.df_read_count = pd.read_csv(self.read_count_file, sep="\t", index_col=0) - + match_dict = utils.parse_match_dir(self.match_dir) - self.match_barcode = match_dict['match_barcode'] + self.match_barcode = match_dict['match_barcode'] self.cell_total = match_dict['cell_total'] - self.tsne_file = match_dict['tsne_coord'] + self.tsne_file = match_dict['tsne_coord'] self.matrix_dir = match_dict['matrix_dir'] # init @@ -163,7 +163,6 @@ class Count_tag(Step): signal_tags_str = "_".join(signal_tags) return signal_tags_str - def write_and_plot(self, df, column_name, count_file, plot_file): df_count = df.groupby(["tag", column_name]).size().unstack() df_count.fillna(0, inplace=True) @@ -278,7 +277,7 @@ class Count_tag(Step): plot_file=self.combine_cluster_plot ) - sr_tag_count = df_UMI_cell["tag"].value_counts() # series(index:tag name, value:tag count) + sr_tag_count = df_UMI_cell["tag"].value_counts() # series(index:tag name, value:tag count) for tag_name in ("Undetermined", "Multiplet"): self.add_metric( name=tag_name + ' Cells', @@ -311,4 +310,3 @@ class Count_tag(Step): ) Count_tag.seurat_hashtag.logger.info(cmd) subprocess.check_call(cmd, shell=True) - diff --git a/celescope/tag/mapping_tag.py b/celescope/tag/mapping_tag.py index 6cd44423..290c0356 100755 --- a/celescope/tag/mapping_tag.py +++ b/celescope/tag/mapping_tag.py @@ -12,15 +12,15 @@ from celescope.tools.step import Step, s_common def get_opts_mapping_tag(parser, sub_program): parser.add_argument( - "--fq_pattern", + "--fq_pattern", help="""Required. R2 read pattern. The number after the letter represents the number of bases. `L` linker(common sequences) `C` tag barcode -""", +""", required=True ) parser.add_argument( - "--barcode_fasta", + "--barcode_fasta", help="""Required. Tag barcode fasta file. It will check the mismatches between tag barcode sequence in R2 reads with all tag barcode sequence in barcode_fasta. It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. @@ -35,11 +35,11 @@ AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA >tag_3 CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG ``` -""", +""", required=True, ) parser.add_argument( - "--linker_fasta", + "--linker_fasta", help="""Optional. If provided, it will check the mismatches between linker sequence in R2 reads with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. 
""", @@ -134,8 +134,8 @@ class Mapping_tag(Step): if miss_length > 2: reads_unmapped_too_short += 1 continue - seq_barcode = seq_barcode + "A" * miss_length - + seq_barcode = seq_barcode + "A" * miss_length + # check linker if self.linker_length != 0: valid_linker = False @@ -145,7 +145,7 @@ class Mapping_tag(Step): break else: valid_linker = True - + if not valid_linker: reads_unmapped_invalid_iinker += 1 continue diff --git a/celescope/tag/multi_tag.py b/celescope/tag/multi_tag.py index 0686fc6b..23d0e9b1 100755 --- a/celescope/tag/multi_tag.py +++ b/celescope/tag/multi_tag.py @@ -25,7 +25,6 @@ class Multi_tag(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) - def analysis_tag(self, sample): step = 'analysis_tag' tsne_tag_file = f'{self.outdir_dic[sample]["count_tag"]}/{sample}_tsne_tag.tsv' @@ -49,10 +48,10 @@ class Multi_tag(Multi): self.process_cmd(cmd, step, sample, m=5, x=1) - def main(): multi = Multi_tag(__ASSAY__) multi.run() + if __name__ == '__main__': main() diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index aa3bb587..53b2a732 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -12,6 +12,7 @@ import celescope.tools.utils as utils from celescope.tools.step import Step, s_common from celescope.__init__ import HELP_DICT + class Split_tag(Step): """ Features @@ -20,6 +21,7 @@ class Split_tag(Step): Output - `fastq/{tag}_{1,2}.fq` Fastq files of each tag. """ + def __init__(self, args, step_name): Step.__init__(self, args, step_name) @@ -45,7 +47,6 @@ class Split_tag(Step): self.tag_read_index_dict = defaultdict(set) - @utils.add_log def write_r2_fastq_files(self): read_num = 0 @@ -73,25 +74,26 @@ class Split_tag(Step): for tag in self.tag_read_index_dict: if read_index in self.tag_read_index_dict[tag]: self.r1_fastq_files_handle[tag].write(str(read) + '\n') - + for tag in self.r1_fastq_files_handle: self.r1_fastq_files_handle[tag].close() - @utils.add_log def run(self): if self.args.split_fastq: self.write_r2_fastq_files() self.write_r1_fastq_files() + def split_tag(args): step_name = "split_tag" runner = Split_tag(args, step_name) runner.run() + def get_opts_split_tag(parser, sub_program): parser.add_argument( - "--split_fastq", + "--split_fastq", help="If used, will split scRNA-Seq fastq file according to tag assignment.", action='store_true', ) @@ -100,4 +102,3 @@ def get_opts_split_tag(parser, sub_program): parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True) parser.add_argument("--R1_read", help='R1 read path.') s_common(parser) - diff --git a/celescope/tag/tests.py b/celescope/tag/tests.py index 446f17c2..42bedeb3 100644 --- a/celescope/tag/tests.py +++ b/celescope/tag/tests.py @@ -19,4 +19,4 @@ class Tests(unittest.TestCase): f'--matrix_10X {matrix_10X} ' ) print(cmd) - subprocess.check_call(cmd, shell=True) \ No newline at end of file + subprocess.check_call(cmd, shell=True) diff --git a/celescope/tcr_fl/__init__.py b/celescope/tcr_fl/__init__.py index 55c8fc6b..116f69bb 100755 --- a/celescope/tcr_fl/__init__.py +++ b/celescope/tcr_fl/__init__.py @@ -1,2 +1,2 @@ __STEPS__ = ['sample', 'barcode', 'cutadapt', 'split_fq', 'assemble'] -__ASSAY__ = 'tcr_fl' \ No newline at end of file +__ASSAY__ = 'tcr_fl' diff --git a/celescope/tcr_fl/assemble.py b/celescope/tcr_fl/assemble.py index 1fae3006..55aae9e4 100755 --- a/celescope/tcr_fl/assemble.py +++ b/celescope/tcr_fl/assemble.py @@ -29,6 +29,7 @@ def tracer(fq, outdir): ) subprocess.check_call(cmd, shell=True) + class Assemble_TCR(Step): def 
__init__(self, args, step): Step.__init__(self, args, step) diff --git a/celescope/tcr_fl/barcode_index.py b/celescope/tcr_fl/barcode_index.py index 1e05644a..8ab80a11 100755 --- a/celescope/tcr_fl/barcode_index.py +++ b/celescope/tcr_fl/barcode_index.py @@ -25,6 +25,3 @@ class Barcode_index(): write index-barcode to file """ self.df_index.to_csv(file_name, sep='\t') - - - \ No newline at end of file diff --git a/celescope/tcr_fl/multi_tcr_fl.py b/celescope/tcr_fl/multi_tcr_fl.py index d4e7e037..b9bcd780 100755 --- a/celescope/tcr_fl/multi_tcr_fl.py +++ b/celescope/tcr_fl/multi_tcr_fl.py @@ -40,5 +40,6 @@ def main(): multi = Multi_tcr_fl(__ASSAY__) multi.run() + if __name__ == '__main__': main() diff --git a/celescope/tcr_fl/split_fq.py b/celescope/tcr_fl/split_fq.py index c46c0538..b6b578c8 100755 --- a/celescope/tcr_fl/split_fq.py +++ b/celescope/tcr_fl/split_fq.py @@ -24,7 +24,7 @@ def get_nCell_barcodes(fq, nCell): for barcode in count_dict: barcode_dict[barcode] = len(count_dict[barcode]) barcodes = pd.DataFrame.from_dict(barcode_dict, orient='index').sort_values( - 0, ascending=False).iloc[0:nCell,].index + 0, ascending=False).iloc[0:nCell, ].index return barcodes @@ -46,7 +46,7 @@ def split_run(fq, fq_outdir, barcodes=None, nCell=None): if barcode in barcodes: cell_index = bi.index_dict[barcode] entry_dict[cell_index].append(entry) - + # write to file for cell_index in entry_dict: with open(f'{fq_outdir}/{cell_index}.fq', 'w') as f: @@ -69,10 +69,11 @@ def split_fq(args): fq_outdir = f'{args.outdir}/fastq' if nCell and nCell != 'None': nCell = int(nCell) - bi = split_run(args.fq, fq_outdir, barcodes, nCell) + bi = split_run(args.fq, fq_outdir, barcodes, nCell) index_file = f'{outdir}/{sample}_index.tsv' bi.df_index.to_csv(index_file, sep='\t') + def get_opts_split_fq(parser, sub_program): if sub_program: parser.add_argument('--outdir', help='output dir', required=True) @@ -81,4 +82,4 @@ def get_opts_split_fq(parser, sub_program): parser.add_argument('--assay', help='assay', required=True) parser.add_argument( "--match_dir", help="match scRNA-Seq dir") - parser.add_argument("--nCell", help="select top N cell") \ No newline at end of file + parser.add_argument("--nCell", help="select top N cell") diff --git a/celescope/tests/conftest.py b/celescope/tests/conftest.py index 683dc23a..25afa488 100644 --- a/celescope/tests/conftest.py +++ b/celescope/tests/conftest.py @@ -11,4 +11,4 @@ def pytest_generate_tests(metafunc): if 'assays' in metafunc.fixturenames and assays_value is not None: metafunc.parametrize("assays", [assays_value]) if 'test_dir' in metafunc.fixturenames and test_dir_value is not None: - metafunc.parametrize("test_dir", [test_dir_value]) \ No newline at end of file + metafunc.parametrize("test_dir", [test_dir_value]) diff --git a/celescope/tests/test_function.py b/celescope/tests/test_function.py index 8936c9aa..35cb83ed 100755 --- a/celescope/tests/test_function.py +++ b/celescope/tests/test_function.py @@ -8,8 +8,8 @@ from celescope.tools.step import Step class Tests(unittest.TestCase): def setUp(self): - pass - + pass + @unittest.skip("tested") def test_stat_to_metric(self): os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/multi_tests/rna') @@ -28,4 +28,4 @@ class Tests(unittest.TestCase): print(obj.content_dict['metric']) def test_test(self): - assert 0 == 0 \ No newline at end of file + assert 0 == 0 diff --git a/celescope/tests/test_multi.py b/celescope/tests/test_multi.py index 68c9f23d..fe853db7 100755 --- a/celescope/tests/test_multi.py +++ 
b/celescope/tests/test_multi.py @@ -34,6 +34,7 @@ def run_single(assay, test_dir): print("*" * 20 + "success " + assay + "*" * 20) return f"{assay} success." + @utils.add_log def test_mutiple(assays, test_dir): """ @@ -57,4 +58,3 @@ def test_mutiple(assays, test_dir): for result in res_list: print(result) assert not any((string.find("failed") != -1 for string in res_list)) -
diff --git a/celescope/tools/analysis_mixin.py b/celescope/tools/analysis_mixin.py index 85e92af5..03ccb2af 100755 --- a/celescope/tools/analysis_mixin.py +++ b/celescope/tools/analysis_mixin.py @@ -12,14 +12,14 @@ class AnalysisMixin(): """ mixin class for analysis child class must inherit Step class - """ + """ def __init__(self, args): if hasattr(args, "match_dir") and args.match_dir: self.match_dir = args.match_dir self.read_match_dir() else: - self.match_dir = args.outdir + "/../" # use self + self.match_dir = args.outdir + "/../" # use self @utils.add_log def seurat(self, matrix_file, save_rds, genomeDir): @@ -37,7 +37,6 @@ class AnalysisMixin(): AnalysisMixin.seurat.logger.info(cmd) subprocess.check_call(cmd, shell=True) - @utils.add_log def auto_assign(self, type_marker_tsv): rds = f'{self.outdir}/{self.sample}.rds' @@ -87,12 +86,12 @@ class AnalysisMixin(): return html code """ - avg_logfc_col = "avg_log2FC" # seurat 4 - if "avg_logFC" in self.marker_df.columns: # seurat 2.3.4 + avg_logfc_col = "avg_log2FC" # seurat 4 + if "avg_logFC" in self.marker_df.columns: # seurat 2.3.4 avg_logfc_col = "avg_logFC" marker_df = self.marker_df.loc[:, - ["cluster", "gene", avg_logfc_col, "pct.1", "pct.2", "p_val_adj"] - ] + ["cluster", "gene", avg_logfc_col, "pct.1", "pct.2", "p_val_adj"] + ] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") return marker_df
diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index c03b8874..f7d03403 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -16,6 +16,7 @@ from celescope.tools.step import Step, s_common MIN_T = 10 + def seq_ranges(seq, pattern_dict): # get subseq with intervals in arr and concatenate return ''.join([seq[x[0]:x[1]]for x in pattern_dict]) @@ -222,7 +223,7 @@ class Chemistry(): else: linker_wrong_dict[linker] += 1 - percent_T4 = T4_n / self.nRead + percent_T4 = T4_n / self.nRead percent_L57C = L57C_n / self.nRead Chemistry.get_chemistry.logger.info(f'percent T4: {percent_T4}') Chemistry.get_chemistry.logger.info(f'percent L57C: {percent_L57C}') @@ -302,7 +303,7 @@ class Barcode(Step): self.lowQual = args.lowQual self.allowNoPolyT = args.allowNoPolyT self.allowNoLinker = args.allowNoLinker - self.nopolyT = args.nopolyT # true == output nopolyT reads + self.nopolyT = args.nopolyT # true == output nopolyT reads self.noLinker = args.noLinker # out file @@ -318,7 +319,6 @@ class Barcode(Step): self.noLinker_1 = f'{self.outdir}/noLinker_1.fq' self.noLinker_2 = f'{self.outdir}/noLinker_2.fq' - @utils.add_log def run(self): """ @@ -417,7 +417,7 @@ class Barcode(Step): '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2)) continue - # lowQual filter + # lowQual filter C_U_quals_ascii = seq_ranges( qual1, pattern_dict['C'] + pattern_dict['U']) # C_U_quals_ord = [ord(q) - 33 for q in C_U_quals_ascii] @@ -440,7 +440,7 @@ class Barcode(Step): continue elif bool_corrected: self.linker_corrected_num += 1 - + # barcode filter seq_list = get_seq_list(seq1, pattern_dict, 'C') if bool_whitelist: @@ -498,11 +498,11 @@ class Barcode(Step): ''' with open(self.stat_file, 'w') as fh: stat_info = stat_info %
(utils.format_number(self.total_num), utils.format_number(self.clean_num), - cal_percent(self.clean_num), BarcodesQ30, - UMIsQ30) + cal_percent(self.clean_num), BarcodesQ30, + UMIsQ30) stat_info = re.sub(r'^\s+', r'', stat_info, flags=re.M) fh.write(stat_info) - + self.clean_up() @@ -515,13 +515,13 @@ def barcode(args): def get_opts_barcode(parser, sub_program=True): parser.add_argument( - '--chemistry', + '--chemistry', help="""Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: - `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. - `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. - `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the same time.""", - choices=list(__PATTERN_DICT__.keys()), + choices=list(__PATTERN_DICT__.keys()), default='auto' ) parser.add_argument( @@ -538,14 +538,14 @@ same time.""", help='Cell barcode whitelist file path, one cell barcode per line.' ) parser.add_argument( - '--linker', + '--linker', help='Linker whitelist file path, one linker per line.' ) parser.add_argument( - '--lowQual', + '--lowQual', help='Default 0. Bases in cell barcode and UMI whose phred value are lower than \ lowQual will be regarded as low-quality bases.', - type=int, + type=int, default=0 ) parser.add_argument( @@ -560,23 +560,23 @@ lowQual will be regarded as low-quality bases.', action='store_true', ) parser.add_argument( - '--noLinker', + '--noLinker', help='Outputs R1 reads without correct linker.', action='store_true', ) parser.add_argument( - '--allowNoPolyT', - help="Allow valid reads without polyT.", + '--allowNoPolyT', + help="Allow valid reads without polyT.", action='store_true' ) parser.add_argument( - '--allowNoLinker', - help="Allow valid reads without correct linker.", + '--allowNoLinker', + help="Allow valid reads without correct linker.", action='store_true' ) parser.add_argument( - '--gzip', - help="Output gzipped fastq files.", + '--gzip', + help="Output gzipped fastq files.", action='store_true' ) if sub_program: diff --git a/celescope/tools/cellranger3/cell_calling_3.py b/celescope/tools/cellranger3/cell_calling_3.py index 54f15d14..a47f6513 100755 --- a/celescope/tools/cellranger3/cell_calling_3.py +++ b/celescope/tools/cellranger3/cell_calling_3.py @@ -56,7 +56,7 @@ def estimate_profile_sgt(matrix, barcode_indices, nz_feat): profile (np.array(float)): Estimated probabilities of length len(nz_feat). 
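This profile later feeds `eval_multinomial_loglikelihoods`, so features unseen in the ambient pool must still receive nonzero probability. A toy illustration with made-up counts, using a crude pseudocount in place of the actual Simple Good-Turing smoothing:
```
import numpy as np
from scipy import stats

# Pooled ambient counts for 4 genes; gene 4 was never seen in empty barcodes.
ambient_counts = np.array([90, 9, 1, 0])

# Unsmoothed profile: the unseen gene has probability 0, so any candidate
# barcode containing one read of it scores log-likelihood -inf.
p_raw = ambient_counts / ambient_counts.sum()

# Crude pseudocount smoothing, standing in for Simple Good-Turing:
# reserve a little mass for unseen genes so likelihoods stay finite.
p_smooth = (ambient_counts + 0.1) / (ambient_counts + 0.1).sum()

bc = np.array([5, 1, 0, 1])  # counts observed for one candidate barcode
print(stats.multinomial.logpmf(bc, bc.sum(), p_raw))     # -inf
print(stats.multinomial.logpmf(bc, bc.sum(), p_smooth))  # finite
```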
""" # Initial profile estimate - prof_mat = matrix[:,barcode_indices] + prof_mat = matrix[:, barcode_indices] profile = np.ravel(prof_mat[nz_feat, :].sum(axis=1)) zero_feat = np.flatnonzero(profile == 0) @@ -105,13 +105,13 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, TBD """ NonAmbientBarcodeResult = namedtuple('NonAmbientBarcodeResult', - ['eval_bcs', # Candidate barcode indices (n) - 'log_likelihood',# Ambient log likelihoods (n) - 'pvalues', # pvalues (n) - 'pvalues_adj', # B-H adjusted pvalues (n) - 'is_nonambient', # Boolean nonambient calls (n) - ]) - + ['eval_bcs', # Candidate barcode indices (n) + 'log_likelihood', # Ambient log likelihoods (n) + 'pvalues', # pvalues (n) + 'pvalues_adj', # B-H adjusted pvalues (n) + 'is_nonambient', # Boolean nonambient calls (n) + ]) + # Estimate an ambient RNA profile umis_per_bc = np.squeeze(np.asarray(raw_mat.sum(axis=0))) # get the index of sorted umis_per_bc (ascending, bc_order[0] is the index of the smallest element in umis_per_bc) @@ -130,47 +130,48 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, if len(use_bcs) > 0: try: - ## Get used "Gene" features (eval_features) - ## and the smoothed prob profile per "Gene" (ambient_profile_p) + # Get used "Gene" features (eval_features) + # and the smoothed prob profile per "Gene" (ambient_profile_p) eval_features, ambient_profile_p = est_background_profile_sgt(raw_mat.tocsc(), use_bcs) except cr_sgt.SimpleGoodTuringError as e: print(str(e)) else: eval_features = np.zeros(0, dtype=int) ambient_profile_p = np.zeros(0) - - ### Choose candidate cell barcodes - ### Regular ordmag filter - gg_filtered_indices, gg_filtered_metrics, _msg = cr_stats.filter_cellular_barcodes_ordmag(umis_per_bc, recovered_cells=recovered_cells) + + # Choose candidate cell barcodes + # Regular ordmag filter + gg_filtered_indices, gg_filtered_metrics, _msg = cr_stats.filter_cellular_barcodes_ordmag( + umis_per_bc, recovered_cells=recovered_cells) print('Cell-called barcodes metrics:') print('\n'.join(list(map(lambda x: '{}: {}'.format(*x), list(gg_filtered_metrics.items()))))) print('==============================') - + orig_cell_bc_set = set(gg_filtered_indices) orig_cells = np.flatnonzero(np.fromiter((bc in orig_cell_bc_set for bc in range(raw_mat.shape[1])), dtype=bool)) - ## No good incoming cell calls + # No good incoming cell calls if orig_cells.sum() == 0: print('Error: No original cells are selected!') return None, None, None - ## Look at non-cell barcodes above a minimum UMI count + # Look at non-cell barcodes above a minimum UMI count eval_bcs = np.ma.array(np.arange(raw_mat.shape[1])) eval_bcs[orig_cells] = ma.masked median_initial_umis = np.median(umis_per_bc[orig_cells]) - + min_umis = int(max(min_umis_nonambient, round(np.ceil(median_initial_umis * min_umi_frac_of_median)))) - + print('Median UMIs of initial cell calls: {}'.format(median_initial_umis)) print('Min UMIs: {}'.format(min_umis)) eval_bcs[umis_per_bc < min_umis] = ma.masked n_unmasked_bcs = len(eval_bcs) - eval_bcs.mask.sum() - ## Take the top N_CANDIDATE_BARCODES by UMI count, of barcodes that pass the above criteria - ## For evaluation of non-ambient bcs using background info estimated from SGT + # Take the top N_CANDIDATE_BARCODES by UMI count, of barcodes that pass the above criteria + # For evaluation of non-ambient bcs using background info estimated from SGT eval_bcs = np.argsort(ma.masked_array(umis_per_bc, mask=eval_bcs.mask))[:n_unmasked_bcs][-N_CANDIDATE_BARCODES:] if len(eval_bcs) == 0: @@ -193,7 +194,7 @@ def 
find_nonambient_barcodes(raw_mat, recovered_cells, obs_loglk = cr_stats.eval_multinomial_loglikelihoods(eval_mat, ambient_profile_p) # Simulate log likelihoods - distinct_ns, sim_loglk = cr_stats.simulate_multinomial_loglikelihoods(ambient_profile_p, umis_per_bc[eval_bcs], + distinct_ns, sim_loglk = cr_stats.simulate_multinomial_loglikelihoods(ambient_profile_p, umis_per_bc[eval_bcs], num_sims=10000, verbose=True) # Compute p-values @@ -205,10 +206,10 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, print('Number of non-ambient barcodes from SGT:', len(eval_bcs[is_nonambient])) - ## Runxi's filtering + # Runxi's filtering print('Identify {} cell-associated barcodes'.format(len(orig_cells)+len(eval_bcs[is_nonambient]))) - ## of barcodes overlapped w/ the cellranger results + # of barcodes overlapped w/ the cellranger results filtered_bc_indices = np.concatenate((orig_cells, eval_bcs[is_nonambient]), axis=None) return filtered_bc_indices, gg_filtered_metrics, NonAmbientBarcodeResult( @@ -223,10 +224,10 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, def cell_calling_3(all_matrix_10X_dir, expected_cell_num): raw_mat_path = os.path.join(all_matrix_10X_dir, MATRIX_FILE_NAME) - raw_mat = scipy.io.mmread(raw_mat_path) # scipy.sparse.coo.coo_matrix + raw_mat = scipy.io.mmread(raw_mat_path) # scipy.sparse.coo.coo_matrix raw_features_path = os.path.join(all_matrix_10X_dir, FEATURE_FILE_NAME) - raw_features_df = pd.read_csv(raw_features_path, sep='\t', error_bad_lines=False, names=['id','name','type']) + raw_features_df = pd.read_csv(raw_features_path, sep='\t', error_bad_lines=False, names=['id', 'name', 'type']) raw_features_df['id'].tolist() raw_features_df['name'].tolist() raw_features_df['type'].tolist() @@ -235,10 +236,10 @@ def cell_calling_3(all_matrix_10X_dir, expected_cell_num): raw_barcodes_df = pd.read_csv(raw_barcodes_path, sep='\t', error_bad_lines=False, names=['barcode']) raw_barcodes = np.array(raw_barcodes_df['barcode'].tolist()) - ### Run cell calling + # Run cell calling filtered_bc_indices, round_1_filtered_metrics, _non_ambient_barcode_result = find_nonambient_barcodes( - raw_mat=raw_mat,recovered_cells=expected_cell_num) - + raw_mat=raw_mat, recovered_cells=expected_cell_num) + cell_bc = raw_barcodes[filtered_bc_indices] initial_cell_num = round_1_filtered_metrics['filtered_bcs'] - return cell_bc, initial_cell_num \ No newline at end of file + return cell_bc, initial_cell_num diff --git a/celescope/tools/cellranger3/get_plot_elements.py b/celescope/tools/cellranger3/get_plot_elements.py index aaf5699a..85951fba 100755 --- a/celescope/tools/cellranger3/get_plot_elements.py +++ b/celescope/tools/cellranger3/get_plot_elements.py @@ -16,7 +16,7 @@ CHARTS_PLOTLY_MODEBAR_TRANSFORM_BUTTONS = [ 'zoomIn2d', 'zoomOut2d', 'autoScale2d', - #'resetScale2d' can't totally disable interaction, it seems-- keep reset option + # 'resetScale2d' can't totally disable interaction, it seems-- keep reset option ] CHARTS_PLOTLY_EXPORT_BUTTONS = [ @@ -38,11 +38,11 @@ CHARTS_PLOTLY_MOVABLE_CONFIG = { BC_RANK_PLOT_LINE_WIDTH = 3 # Gradient scheme used in the barcode rank plot BC_PLOT_COLORS = ['#dddddd', '#d1d8dc', '#c6d3dc', '#bacfdb', '#aecada', '#a3c5d9', '#97c0d9', '#8cbbd8', '#80b7d7', - '#74b2d7', '#6aadd6', '#66abd4', '#62a8d2', '#5ea5d1', '#59a2cf', '#559fce', '#519ccc', '#4d99ca', - '#4997c9', '#4594c7', '#4191c5', '#3d8dc4', '#3a8ac2', '#3787c0', '#3383be', '#3080bd', '#2c7cbb', - '#2979b9', '#2676b7', '#2272b6', '#1f6eb3', '#1d6ab0', '#1a65ac', '#1861a9', '#155ca6', 
'#1358a2', - '#10539f', '#0e4f9b', '#0b4a98', '#094695', '#09438f', '#0a4189', '#0c3f83', '#0d3d7c', '#0e3b76', - '#103970', '#11366a', '#123463', '#14325d', '#153057'] + '#74b2d7', '#6aadd6', '#66abd4', '#62a8d2', '#5ea5d1', '#59a2cf', '#559fce', '#519ccc', '#4d99ca', + '#4997c9', '#4594c7', '#4191c5', '#3d8dc4', '#3a8ac2', '#3787c0', '#3383be', '#3080bd', '#2c7cbb', + '#2979b9', '#2676b7', '#2272b6', '#1f6eb3', '#1d6ab0', '#1a65ac', '#1861a9', '#155ca6', '#1358a2', + '#10539f', '#0e4f9b', '#0b4a98', '#094695', '#09438f', '#0a4189', '#0c3f83', '#0d3d7c', '#0e3b76', + '#103970', '#11366a', '#123463', '#14325d', '#153057'] CHARTS = [ { @@ -50,7 +50,7 @@ CHARTS = [ 'title': 'Barcode Rank', 'width': 470, 'height': 313, - 'margin': { 'l': 60, 'r': 0, 't': 30, 'b': 40 }, + 'margin': {'l': 60, 'r': 0, 't': 30, 'b': 40}, 'hovermode': 'closest', 'xaxis': { 'title': 'Barcodes', diff --git a/celescope/tools/cellranger3/sgt.py b/celescope/tools/cellranger3/sgt.py index 0fa31b23..a99a06a4 100755 --- a/celescope/tools/cellranger3/sgt.py +++ b/celescope/tools/cellranger3/sgt.py @@ -23,7 +23,7 @@ def _averaging_transform(r, nr): dr = np.concatenate(( 0.5 * (d[1:] + d[0:-1]), np.array((d[-1],), dtype=float), - )) + )) return nr.astype(float)/dr @@ -52,9 +52,10 @@ def simple_good_turing(xr, xnr): slope, _intercept, _, _, _ = sp_stats.linregress(np.log(xr), np.log(xnrz)) if slope > -1: - raise SimpleGoodTuringError("The log-log slope is > -1 (%d); the SGT estimator is not applicable to these data." % slope) + raise SimpleGoodTuringError( + "The log-log slope is > -1 (%d); the SGT estimator is not applicable to these data." % slope) - xrst = _rstest(xr,slope) + xrst = _rstest(xr, slope) xrstrel = xrst/xr # Get traditional Good-Turing estimate @@ -72,7 +73,7 @@ def simple_good_turing(xr, xnr): useturing = True for r in range(len(xr)): if not useturing: - xrstcmbrel[r] = xrstrel[r] + xrstcmbrel[r] = xrstrel[r] else: if np.abs(xrstrel[r]-xrstarel[r]) * (1+r)/tursd[r] > 1.65: xrstcmbrel[r] = xrstarel[r] diff --git a/celescope/tools/cellranger3/stats.py b/celescope/tools/cellranger3/stats.py index b435844f..27f931c9 100755 --- a/celescope/tools/cellranger3/stats.py +++ b/celescope/tools/cellranger3/stats.py @@ -186,13 +186,13 @@ def filter_cellular_barcodes_ordmag(bc_counts, recovered_cells): that likely represents a cell """ if recovered_cells is None: - ### Modified parameter, didn't use the default value + # Modified parameter, didn't use the default value recovered_cells = 3000 # recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP # 3000 - ## Initialize filter result metrics + # Initialize filter result metrics metrics = init_barcode_filter_result() - ## determine max # of cellular barcodes to consider + # determine max # of cellular barcodes to consider max_filtered_bcs = determine_max_filtered_bcs(recovered_cells) metrics['max_filtered_bcs'] = max_filtered_bcs @@ -202,15 +202,15 @@ def filter_cellular_barcodes_ordmag(bc_counts, recovered_cells): return [], metrics, msg # baseline_bc_idx = int(round(float(recovered_cells) * (1 - cr_constants.ORDMAG_RECOVERED_CELLS_QUANTILE))) # Quantile=0.99 - baseline_bc_idx = int(round(float(recovered_cells) * (1 - 0.99))) # Quantile=0.99 + baseline_bc_idx = int(round(float(recovered_cells) * (1 - 0.99))) # Quantile=0.99 baseline_bc_idx = min(baseline_bc_idx, len(nonzero_bc_counts) - 1) assert baseline_bc_idx < max_filtered_bcs # Bootstrap sampling; run algo with many random samples of the data top_n_boot = np.array([ 
find_within_ordmag(np.random.choice(nonzero_bc_counts, len(nonzero_bc_counts)), baseline_bc_idx) - for i in range(100) # 100 -# for i in range(cr_constants.ORDMAG_NUM_BOOTSTRAP_SAMPLES) # 100 + for i in range(100) # 100 + # for i in range(cr_constants.ORDMAG_NUM_BOOTSTRAP_SAMPLES) # 100 ]) metrics.update(summarize_bootstrapped_top_n(top_n_boot)) @@ -224,9 +224,9 @@ def filter_cellular_barcodes_ordmag(bc_counts, recovered_cells): def filter_cellular_barcodes_fixed_cutoff(bc_counts, cutoff): nonzero_bcs = len(bc_counts[bc_counts > 0]) top_n = min(cutoff, nonzero_bcs) - ## np.argsort(bc_counts) => the indices that would sort an array - ## np.argsort(bc_counts)[0] => idx of the smallest element in array - ## np.argsort(bc_counts)[-1] => idx of the largest element in array + # np.argsort(bc_counts) => the indices that would sort an array + # np.argsort(bc_counts)[0] => idx of the smallest element in array + # np.argsort(bc_counts)[-1] => idx of the largest element in array top_bc_idx = np.sort(np.argsort(bc_counts)[::-1][:top_n]) metrics = { 'filtered_bcs': top_n, @@ -354,7 +354,7 @@ def eval_multinomial_loglikelihoods(matrix, profile_p, max_mem_gb=0.1): for chunk_start in range(0, num_bcs, bcs_per_chunk): chunk = slice(chunk_start, chunk_start+bcs_per_chunk) - matrix_chunk = matrix[:,chunk].transpose().toarray() + matrix_chunk = matrix[:, chunk].transpose().toarray() n = matrix_chunk.sum(1) loglk[chunk] = sp_stats.multinomial.logpmf(matrix_chunk, n, p=profile_p) return loglk @@ -415,7 +415,8 @@ def simulate_multinomial_loglikelihoods(profile_p, umis_per_bc, k += 1 if k >= n_sample_feature_block: # Amortize this operation - sampled_features = np.random.choice(len(profile_p), size=n_sample_feature_block, p=profile_p, replace=True) + sampled_features = np.random.choice( + len(profile_p), size=n_sample_feature_block, p=profile_p, replace=True) k = 0 curr_counts[j] += 1 curr_loglk += log_profile_p[j] + np.log(float(n)/curr_counts[j]) @@ -450,6 +451,6 @@ def compute_ambient_pvalues(umis_per_bc, obs_loglk, sim_n, sim_loglk): pvalues = np.zeros(num_barcodes) for i in range(num_barcodes): - num_lower_loglk = np.sum(sim_loglk[sim_n_idx[i],:] < obs_loglk[i]) + num_lower_loglk = np.sum(sim_loglk[sim_n_idx[i], :] < obs_loglk[i]) pvalues[i] = float(1 + num_lower_loglk) / (1 + num_sims) - return pvalues \ No newline at end of file + return pvalues diff --git a/celescope/tools/consensus.py b/celescope/tools/consensus.py index 6492e7c9..0ae2eeba 100755 --- a/celescope/tools/consensus.py +++ b/celescope/tools/consensus.py @@ -18,6 +18,7 @@ class Consensus(Step): Output - `{sample}_consensus.fq` Consensus fastq. 
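The consensus is essentially a per-position vote across all reads of one (barcode, UMI) group. A simplified, quality-unaware sketch of that vote (the real `dumb_consensus` below also tracks base qualities, counts ambiguous bases, and picks the consensus length by a read-length vote):
```
from collections import Counter

def simple_consensus(reads, threshold=0.5):
    # per-position vote over all reads from one (barcode, UMI) group
    length = max(len(r) for r in reads)  # the real code votes on length too
    consensus = []
    for i in range(length):
        bases = Counter(r[i] for r in reads if len(r) > i)
        base, count = bases.most_common(1)[0]
        # keep the majority base only if it reaches the threshold
        consensus.append(base if count / len(reads) >= threshold else "N")
    return "".join(consensus)

print(simple_consensus(["AATA", "AATT", "AAAC", "AAGG"]))  # prints AATN
```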
""" + def __init__(self, args, step_name): Step.__init__(self, args, step_name) @@ -33,8 +34,8 @@ class Consensus(Step): sort_fastq(self.args.fq, self.fq_tmp_file, self.outdir) n, total_ambiguous_base_n, length_list = sorted_dumb_consensus( - fq=self.fq_tmp_file, - outfile=self.consensus_fq, + fq=self.fq_tmp_file, + outfile=self.consensus_fq, threshold=self.args.threshold ) @@ -54,7 +55,6 @@ class Consensus(Step): self.clean_up() - @utils.add_log def sort_fastq(fq, fq_tmp_file, outdir): tmp_dir = f'{outdir}/tmp' @@ -96,7 +96,7 @@ def sorted_dumb_consensus(fq, outfile, threshold): sorted_dumb_consensus.logger.info(f'{n_umi} UMI done.') total_ambiguous_base_n += ambiguous_base_n length_list.append(con_len) - + out_h.close() return n_umi, total_ambiguous_base_n, length_list @@ -159,7 +159,7 @@ def get_read_length(read_list, threshold=0.5): length = max length with read fraction >= threshold elements of read_list: [entry.sequence,entry.quality] ''' - + n_read = len(read_list) length_dict = defaultdict(int) for read in read_list: @@ -169,11 +169,12 @@ def get_read_length(read_list, threshold=0.5): length_dict[length] = length_dict[length] / n_read fraction = 0 - for length in sorted(length_dict.keys(),reverse=True): + for length in sorted(length_dict.keys(), reverse=True): fraction += length_dict[length] if fraction >= threshold: return length + @utils.add_log def consensus(args): @@ -181,9 +182,10 @@ def consensus(args): consensus_obj = Consensus(args, step_name) consensus_obj.run() + def get_opts_consensus(parser, sub_program): parser.add_argument("--threshold", help='Default 0.5. Valid base threshold. ', type=float, default=0.5) parser.add_argument("--not_consensus", help="Skip the consensus step. ", action='store_true') if sub_program: parser.add_argument("--fq", help="Required. Fastq file.", required=True) - s_common(parser) \ No newline at end of file + s_common(parser) diff --git a/celescope/tools/count.py b/celescope/tools/count.py index b532f646..b3da47f1 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -71,6 +71,7 @@ class Count(Step): """ + def __init__(self, args, step): Step.__init__(self, args, step) self.force_cell_num = args.force_cell_num @@ -186,7 +187,7 @@ class Count(Step): with open(self.count_detail_file, 'wt') as fh1: fh1.write('\t'.join(['Barcode', 'geneID', 'UMI', 'count']) + '\n') - def keyfunc(x): + def keyfunc(x): return x.query_name.split('_', 1)[0] for _, g in groupby(samfile, keyfunc): gene_umi_dict = defaultdict(lambda: defaultdict(int)) @@ -477,16 +478,16 @@ def get_opts_count(parser, sub_program): parser.add_argument('--genomeDir', help='Required. Genome directory.') parser.add_argument('--expected_cell_num', help='Default `3000`. Expected cell number.', default=3000) parser.add_argument( - '--cell_calling_method', + '--cell_calling_method', help='Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`.', - choices=['auto', 'cellranger3', 'inflection', ], + choices=['auto', 'cellranger3', 'inflection', ], default='auto', ) if sub_program: parser = s_common(parser) parser.add_argument('--bam', help='Required. BAM file from featureCounts.', required=True) parser.add_argument( - '--force_cell_num', - help='Default `None`. Force the cell number to be this value ± 10%.', + '--force_cell_num', + help='Default `None`. 
Force the cell number to be this value ± 10%.', default=None )
diff --git a/celescope/tools/cutadapt.py b/celescope/tools/cutadapt.py index 82fe66f8..cb596d93 100755 --- a/celescope/tools/cutadapt.py +++ b/celescope/tools/cutadapt.py @@ -15,8 +15,8 @@ class Cutadapt(Step): """ Features - Trim adapters in R2 reads with cutadapt. Default adapters include: - - polyT=A{18}, 18 A bases. - - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. + - polyT=A{18}, 18 A bases. + - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. Output - `cutadapt.log` Cutadapt output log file. @@ -38,7 +38,6 @@ class Cutadapt(Step): self.out_fq2 = f'{self.outdir}/{self.sample}_clean_2.fq{suffix}' self.cutadapt_log_file = f'{self.outdir}/cutadapt.log' - @staticmethod def read_adapter_fasta(adapter_fasta): ''' @@ -100,7 +99,7 @@ class Cutadapt(Step): Cutadapt.run.logger.info(cmd) # need encoding argument to return str results = subprocess.run( - cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, + cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, encoding='utf-8', check=True, shell=True ) cutadapt_log = results.stdout @@ -110,7 +109,7 @@ class Cutadapt(Step): self.clean_up() -@utils.add_log +@utils.add_log def cutadapt(args): step_name = "cutadapt" @@ -122,7 +121,7 @@ def get_opts_cutadapt(parser, sub_program): parser.add_argument('--adapter_fasta', help='Additional adapter fasta file.') parser.add_argument( '--minimum_length', - help='Default `20`. Discard processed reads that are shorter than LENGTH.', + help='Default `20`. Discard processed reads that are shorter than LENGTH.', default=20 ) parser.add_argument( @@ -132,7 +131,7 @@ Some Illumina instruments use a two-color chemistry to encode the four bases. This includes the NextSeq and the NovaSeq. In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. However, dark cycles also occur when sequencing “falls off” the end of the fragment. -The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.""", +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.""", default=20, ) parser.add_argument( @@ -141,12 +140,12 @@ The read then contains a run of high-quality, but incorrect “G” calls at its short matches can occur by chance, leading to erroneously trimmed bases. For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. To reduce the number of falsely trimmed bases, the alignment algorithm requires that -at least {overlap} bases match between adapter and read. """, +at least {overlap} bases match between adapter and read.
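For intuition about the default of 10: under a uniform random-base model the chance that the last k bases of a read spuriously match the adapter prefix is 0.25^k, so the default overlap lowers the per-read false-trim chance from roughly 25% to about one in a million (the uniform-base model is a simplification; real base composition is biased):
```
# chance that a random k-mer at the end of a read matches the adapter prefix
for k in (1, 3, 5, 10):
    print(k, 0.25 ** k)
# 1  0.25
# 3  0.015625
# 5  0.0009765625
# 10 9.5367431640625e-07
```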
""", parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true') parser = s_common(parser) return parser - - diff --git a/celescope/tools/debug.py b/celescope/tools/debug.py index add4f065..c4d05edd 100755 --- a/celescope/tools/debug.py +++ b/celescope/tools/debug.py @@ -54,7 +54,6 @@ class Debug(): with open('fastqc.sh', 'wt') as f: f.write(cmd) - def run(self): self.run_subsample() self.run_STAR() @@ -64,5 +63,3 @@ class Debug(): if __name__ == '__main__': de = Debug() de.run() - - \ No newline at end of file diff --git a/celescope/tools/featureCounts.py b/celescope/tools/featureCounts.py index fc96eb0c..878a784a 100755 --- a/celescope/tools/featureCounts.py +++ b/celescope/tools/featureCounts.py @@ -82,7 +82,7 @@ class FeatureCounts(Step): 'featureCounts ' '-s 1 ' f'-a {self.gtf} ' - f'-o {self.out_prefix} ' # not bam + f'-o {self.out_prefix} ' # not bam '-R BAM ' f'-T {self.thread} ' f'-t {self.args.gtf_type} ' @@ -90,7 +90,7 @@ class FeatureCounts(Step): ) FeatureCounts.run_featureCounts.logger.info(cmd) subprocess.check_call(cmd, shell=True) - + @add_log def name_sort_bam(self): cmd = ( @@ -102,7 +102,6 @@ class FeatureCounts(Step): FeatureCounts.name_sort_bam.logger.info(cmd) subprocess.check_call(cmd, shell=True) - def run(self): self.run_featureCounts() add_tag(self.featureCounts_bam, self.gtf) @@ -153,4 +152,3 @@ def get_opts_featureCounts(parser, sub_program): parser.add_argument('--input', help='Required. BAM file path.', required=True) parser = s_common(parser) return parser - diff --git a/celescope/tools/mkref.py b/celescope/tools/mkref.py index e633995b..9e243459 100755 --- a/celescope/tools/mkref.py +++ b/celescope/tools/mkref.py @@ -29,7 +29,7 @@ class Mkref(): # out file self.config_file = f'{self.genomeDir}/{GENOME_CONFIG}' - + @abc.abstractmethod def run(self): return diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 9fed21b8..c1bbd882 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -23,7 +23,7 @@ class Multi(): self.last_step = '' self.args = None self.steps_not_run = ['mkref'] - + # remove for step in self.steps_not_run: if step in self.__STEPS__: @@ -46,9 +46,9 @@ class Multi(): def common_args(self): readme = f'{self.__ASSAY__} multi-samples' - parser = argparse.ArgumentParser(readme, - formatter_class=ArgFormatter, - conflict_handler='resolve') + parser = argparse.ArgumentParser(readme, + formatter_class=ArgFormatter, + conflict_handler='resolve') parser.add_argument('--mod', help='mod, sjm or shell', choices=['sjm', 'shell'], default='sjm') parser.add_argument( '--mapfile', @@ -100,7 +100,6 @@ class Multi(): fq_dict[sample_name] = [[fq1], [fq2]] col4_dict[sample_name] = col4 - for sample_name in fq_dict: fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) fq_dict[sample_name][1] = ",".join(fq_dict[sample_name][1]) @@ -147,7 +146,7 @@ class Multi(): step_outdir = f"{self.args.outdir}/{sample}/{index:02d}.{step}" self.outdir_dic[sample].update({step: step_outdir}) index += 1 - + def generate_cmd(self, cmd, step, sample, m=1, x=1): if sample: sample = "_" + sample @@ -199,7 +198,7 @@ job_end if args_dict[arg]: matches = [' ', '-'] arg_string = str(args_dict[arg]) - if any(char in arg_string for char in matches): # need quote + if any(char in arg_string for char in matches): # need quote cmd_line += f'--{arg} "{arg_string}" ' else: cmd_line += f'--{arg} {arg_string} ' @@ -215,7 +214,7 @@ job_end f'--fq1 {arr[0]} ' ) self.process_cmd(cmd, step, sample, m=1, x=1) - + def barcode(self, sample): step 
= "barcode" arr = self.fq_dict[sample] @@ -304,7 +303,7 @@ job_end ) from attr_not_exist method_to_call(sample) - def merge_report(self): + def merge_report(self): step = "merge_report" steps_str = ",".join(self.__STEPS__) samples = ','.join(self.fq_dict.keys()) diff --git a/celescope/tools/report.py b/celescope/tools/report.py index 0aaf45ba..08c4a7cc 100755 --- a/celescope/tools/report.py +++ b/celescope/tools/report.py @@ -32,7 +32,6 @@ class reporter: def get_report(self): - json_file = self.outdir + '/.data.json' if not os.path.exists(json_file): data = {} @@ -58,7 +57,7 @@ class reporter: if isinstance(self.df, pd.DataFrame): df = self.df.fillna(value="") - data[self.name + '_table'] = df.values.tolist() + data[self.name + '_table'] = df.values.tolist() if self.table_header: data[self.name + '_table_header'] = self.table_header @@ -74,4 +73,3 @@ class reporter: with open(json_file, 'w') as fh: json.dump(data, fh, indent=4) - diff --git a/celescope/tools/sample.py b/celescope/tools/sample.py index 48422696..f8c91fb6 100755 --- a/celescope/tools/sample.py +++ b/celescope/tools/sample.py @@ -11,7 +11,7 @@ from celescope.tools.step import Step, s_common @utils.add_log def sample(args): - + step_name = "sample" step = Step(args, step_name) @@ -30,7 +30,6 @@ def sample(args): chemistry = ",".join(set(chemistry)) else: chemistry = args.chemistry - if not os.path.exists(outdir): os.system('mkdir -p %s' % outdir) @@ -38,7 +37,7 @@ def sample(args): stat = pd.DataFrame({ "item": ["Sample ID", "Assay", "Chemistry", "Software Version"], "count": [sample_name, assay_description, chemistry, version], - }, + }, columns=["item", "count"] ) stat_file = outdir + "/stat.txt" @@ -55,4 +54,3 @@ def get_opts_sample(parser, sub_program): parser.add_argument('--fq1', help='read1 fq file') parser.add_argument('--chemistry', choices=list(__PATTERN_DICT__.keys()), help='chemistry version', default='auto') return parser - diff --git a/celescope/tools/star_mixin.py b/celescope/tools/star_mixin.py index 3694959d..a5cfa9cf 100755 --- a/celescope/tools/star_mixin.py +++ b/celescope/tools/star_mixin.py @@ -10,6 +10,7 @@ class StarMixin(): """ Mixin class for STAR """ + def __init__(self, args, add_prefix=None): self.fq = args.fq self.genomeDir = args.genomeDir @@ -33,7 +34,7 @@ class StarMixin(): self.STAR_map_log = f'{self.outPrefix}Log.final.out' self.unsort_STAR_bam = f'{self.outPrefix}Aligned.out.bam' self.STAR_bam = f'{self.outPrefix}Aligned.sortedByCoord.out.bam' - + @utils.add_log def STAR(self): cmd = [ @@ -43,7 +44,7 @@ class StarMixin(): '--readFilesIn', self.fq, '--outFilterMultimapNmax', str(self.multi_max), '--outFileNamePrefix', self.outPrefix, - '--outSAMtype', 'BAM', 'Unsorted', # controls sort by Coordinate or not + '--outSAMtype', 'BAM', 'Unsorted', # controls sort by Coordinate or not '--outFilterMatchNmin', str(self.outFilterMatchNmin) ] if self.out_unmapped: @@ -72,7 +73,7 @@ class StarMixin(): @utils.add_log def index_bam(self): - utils.index_bam(self.STAR_bam) + utils.index_bam(self.STAR_bam) def get_star_metrics(self): """ @@ -113,29 +114,29 @@ class StarMixin(): def get_opts_star_mixin(parser, sub_program): parser.add_argument( - '--genomeDir', + '--genomeDir', help='Required. Genome directory.' ) parser.add_argument( - '--outFilterMatchNmin', + '--outFilterMatchNmin', help="""Default `0`. 
Alignment will be output only if the number of matched bases -is higher than or equal to this value.""", +is higher than or equal to this value.""", default=0 ) parser.add_argument( '--out_unmapped', - help='Output unmapped reads', + help='Output unmapped reads', action='store_true' ) parser.add_argument('--STAR_param', help='Other STAR parameters', default="") parser.add_argument( '--outFilterMultimapNmax', - help='Default `1`. How many places are allowed to match a read at most.', + help='Default `1`. How many places are allowed to match a read at most.', default=1 ) parser.add_argument( '--starMem', - help='Default `30`. Maximum memory that STAR can use.', + help='Default `30`. Maximum memory that STAR can use.', default=30 ) if sub_program:
diff --git a/celescope/tools/step.py b/celescope/tools/step.py index 0c759de3..39e75b59 100755 --- a/celescope/tools/step.py +++ b/celescope/tools/step.py @@ -14,6 +14,7 @@ from celescope.tools.utils import add_log Metric = namedtuple("Metric", "name value total fraction") + def s_common(parser): """subparser common arguments """ @@ -21,7 +22,8 @@ def s_common(parser): parser.add_argument('--assay', help='Assay name.', required=True) parser.add_argument('--sample', help='Sample name.', required=True) parser.add_argument('--thread', help='Thread to use.', default=4) - parser.add_argument('--debug', help='If this argument is used, celescope may output additional file for debugging.', action='store_true') + parser.add_argument( + '--debug', help='If this argument is used, celescope may output additional file for debugging.', action='store_true') return parser @@ -29,6 +31,7 @@ class Step: """ Step class """ + def __init__(self, args, step_name): self.step_name = step_name self.args = args @@ -37,7 +40,7 @@ class Step: self.assay = args.assay self.thread = int(args.thread) self.debug = args.debug - # set + # set self.out_prefix = f'{self.outdir}/{self.sample}' # important! make outdir before path_dict because path_dict uses relative paths. @@ -113,7 +116,6 @@ class Step: line += f'{fraction}%' stat_handle.write(line + '\n') - def dump_content(self, slot): '''dump content to json file ''' @@ -146,7 +148,7 @@ class Step: metrics = dict() for metric_name, string in dic.items(): bool_fraction = False - bool_value = False + bool_value = False if '%' in string: bool_fraction = True if "(" in string: @@ -154,17 +156,17 @@ class Step: chars = [',', '%', ')'] for character in chars: string = string.replace(character, '') - + if bool_fraction: - if bool_value: # case 2 + if bool_value: # case 2 value, fraction = string.split('(') fraction = round(float(fraction) / 100, 4) metrics[metric_name] = int(value) metrics[metric_name + ' Fraction'] = fraction - else: # case 3 + else: # case 3 fraction = round(float(string) / 100, 4) metrics[metric_name] = fraction - else: # case 1 + else: # case 1 value = string if '.'
in string: try: @@ -223,5 +225,3 @@ class Step: @abc.abstractmethod def run(self): return - - diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index 5fe8aa39..b43609ba 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -57,7 +57,7 @@ class Target_metrics(Step): self.count_dict[barcode][gene_name][UMI] += 1 @utils.add_log - def parse_count_dict_add_metrics(self): + def parse_count_dict_add_metrics(self): total_UMIs = 0 enriched_UMIs = 0 enriched_UMIs_in_cells = 0 @@ -80,7 +80,7 @@ class Target_metrics(Step): self.add_metric( name="Total UMIs", value=total_UMIs, - ) + ) self.add_metric( name="Enriched UMIs", @@ -122,4 +122,3 @@ def get_opts_target_metrics(parser, sub_program): parser.add_argument("--bam", help='Input bam file', required=True) parser.add_argument('--match_dir', help=HELP_DICT['match_dir'], required=True) parser = s_common(parser) - diff --git a/celescope/tools/tests.py b/celescope/tools/tests.py index 1bba4dff..ca0d5c5f 100755 --- a/celescope/tools/tests.py +++ b/celescope/tools/tests.py @@ -10,6 +10,7 @@ class Tests(unittest.TestCase): """ Run this test under a temp folder as it will generate some files. """ + def setUp(self): pass @@ -33,11 +34,11 @@ class Tests(unittest.TestCase): step.clean_up() def test_get_read_length(self): - read_list = [['AAAA','FFFF'],['TTT','FFF'],['CCC','FFF'],['GGGGGGG','FFFFFFF']] + read_list = [['AAAA', 'FFFF'], ['TTT', 'FFF'], ['CCC', 'FFF'], ['GGGGGGG', 'FFFFFFF']] assert get_read_length(read_list, 0.5) == 4 def test_dumb_consensus(self): - read_list = [('AAAA','FFFF'),('TTT','FF;'),('CCC','FFF'),('GGGGGGG','FFFFFFF')] + read_list = [('AAAA', 'FFFF'), ('TTT', 'FF;'), ('CCC', 'FFF'), ('GGGGGGG', 'FFFFFFF')] consensus_seq, consensus_qual, _ambiguous_base_n, _con_len = dumb_consensus(read_list, 0.5) print(consensus_qual) assert consensus_seq == 'NNNA' @@ -53,10 +54,10 @@ class Tests(unittest.TestCase): "ccccc2": 199, } n_corrected_umi, n_corrected_read = Count.correct_umi(dic) - sorted_dic = sorted(dic.items(), key=lambda x:x[1]) + sorted_dic = sorted(dic.items(), key=lambda x: x[1]) assert sorted_dic == [('ccccc1', 20), ('apple2', 32), ('bears3', 115), ('ccccc2', 199)] assert n_corrected_umi == 3 - assert n_corrected_read == 2 + 5 + 10 + assert n_corrected_read == 2 + 5 + 10 if __name__ == '__main__': diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index a74388ea..2c012d14 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -48,7 +48,7 @@ def add_log(func): @wraps(func) def wrapper(*args, **kwargs): if args and hasattr(args[0], 'debug') and args[0].debug: - logger.setLevel(10) # debug + logger.setLevel(10) # debug logger.info('start...') start = time.time() @@ -63,10 +63,10 @@ def add_log(func): def using(point=""): - usage=resource.getrusage(resource.RUSAGE_SELF) + usage = resource.getrusage(resource.RUSAGE_SELF) return '''%s: usertime=%s systime=%s mem=%s mb - '''%(point,usage[0],usage[1], - usage[2]/1024.0) + ''' % (point, usage[0], usage[1], + usage[2]/1024.0) def add_mem(func): @@ -185,6 +185,7 @@ def generic_open(file_name, *args, **kwargs): file_obj = open(file_name, *args, **kwargs) return file_obj + @add_log def get_id_name_dict(gtf_file): """ @@ -213,23 +214,23 @@ def get_id_name_dict(gtf_file): gene_id = gene_id_pattern.findall(attributes)[-1] gene_names = gene_name_pattern.findall(attributes) if not gene_names: - gene_name = gene_id + gene_name = gene_id else: gene_name = gene_names[-1] c[gene_name] += 1 if c[gene_name] > 1: 
if gene_id in id_name: assert id_name[gene_id] == gene_name, ( - 'one gene_id with multiple gene_name ' - f'gene_id: {gene_id}, ' - f'gene_name this line: {gene_name}' - f'gene_name previous line: {id_name[gene_id]}' - ) + 'one gene_id with multiple gene_name ' + f'gene_id: {gene_id}, ' + f'gene_name this line: {gene_name}' + f'gene_name previous line: {id_name[gene_id]}' + ) get_id_name_dict.logger.warning( - 'duplicated (gene_id, gene_name)' - f'gene_id: {gene_id}, ' - f'gene_name {gene_name}' - ) + 'duplicated (gene_id, gene_name)' + f'gene_id: {gene_id}, ' + f'gene_name {gene_name}' + ) c[gene_name] -= 1 else: gene_name = f'{gene_name}_{c[gene_name]}' @@ -239,8 +240,8 @@ def get_id_name_dict(gtf_file): @add_log def process_read( - read2_file, pattern_dict, barcode_dict, linker_dict, - barcode_length, linker_length): + read2_file, pattern_dict, barcode_dict, linker_dict, + barcode_length, linker_length): # if valid, return (True) metrics = defaultdict(int) @@ -270,8 +271,8 @@ def process_read( if miss_length > 2: metrics['Reads Unmapped too Short'] += 1 continue - seq_barcode = seq_barcode + "A" * miss_length - + seq_barcode = seq_barcode + "A" * miss_length + # check linker if linker_length != 0: valid_linker = False @@ -281,7 +282,7 @@ def process_read( break else: valid_linker = True - + if not valid_linker: metrics['Reads Unmapped Invalid Linker'] += 1 continue @@ -371,7 +372,7 @@ def gen_stat(df, stat_file): value = f'{format_number(count)}({round(percent * 100, 2)}%)' return value - df.loc[:,'value'] = df.loc[:,'count'] + df.loc[:, 'value'] = df.loc[:, 'count'] df.loc[~df['total_count'].isna(), 'value'] = df.loc[~df['total_count'].isna(), :].apply( add_percent, axis=1 ) @@ -387,9 +388,9 @@ def get_read(library_id, library_path, read='1'): fq_list = ['fq', 'fastq'] suffix_list = ["", ".gz"] read_pattern_list = [ - f'{library_path}/*{library_id}*{read}.{fq_str}{suffix}' - for read in read1_list - for fq_str in fq_list + f'{library_path}/*{library_id}*{read}.{fq_str}{suffix}' + for read in read1_list + for fq_str in fq_list for suffix in suffix_list ] fq_list = [glob.glob(read1_pattern) for read1_pattern in read_pattern_list] @@ -469,8 +470,8 @@ job_end def merge_report( - fq_dict, steps, last_step, sjm_cmd, - sjm_order, logdir, conda, outdir, rm_files): + fq_dict, steps, last_step, sjm_cmd, + sjm_order, logdir, conda, outdir, rm_files): step = "merge_report" steps_str = ",".join(steps) samples = ','.join(fq_dict.keys()) @@ -577,7 +578,7 @@ def report_prepare(outdir, **kwargs): json.dump(data, fh) -def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID','CID')): +def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID', 'CID')): vcf = pysam.VariantFile(vcf_file) df = pd.DataFrame(columns=[col.capitalize() for col in cols] + infos) rec_dict = {} @@ -587,7 +588,7 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID','CID')): rec_dict[col.capitalize()] = getattr(rec, col) if col == 'alleles': rec_dict['Alleles'] = '-'.join(rec_dict['Alleles']) - + for info in infos: rec_dict[info] = rec.info[info] @@ -597,12 +598,12 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID','CID')): rec_dict['GT'] = '/'.join(rec_dict['GT']) ''' - df = df.append(pd.Series(rec_dict),ignore_index=True) + df = df.append(pd.Series(rec_dict), ignore_index=True) return df def parse_annovar(annovar_file): - df = pd.DataFrame(columns=['Gene','mRNA', 'Protein', 'COSMIC']) + df = pd.DataFrame(columns=['Gene', 'mRNA', 'Protein', 'COSMIC']) with 
open(annovar_file, 'rt') as f: index = 0 for line in f: @@ -632,7 +633,7 @@ def parse_annovar(annovar_file): if change_attr.startswith('p.'): protein = change_attr.strip('p.') if not (mRNA, protein) in change_list: - change_list.append((mRNA, protein)) + change_list.append((mRNA, protein)) combine = [','.join(item) for item in list(zip(*change_list))] mRNA = combine[0] protein = combine[1] @@ -763,6 +764,7 @@ def find_step_module(assay, step): return step_module + def find_step_module_with_folder(assay, step): init_module = find_assay_init(assay) folder = "" @@ -792,4 +794,4 @@ def sort_bam(input_bam, output_bam, threads=1): def index_bam(input_bam): cmd = f"samtools index {input_bam}" - subprocess.check_call(cmd, shell=True) \ No newline at end of file + subprocess.check_call(cmd, shell=True) diff --git a/celescope/vdj/__init__.py b/celescope/vdj/__init__.py index 9038708c..fd57f4eb 100755 --- a/celescope/vdj/__init__.py +++ b/celescope/vdj/__init__.py @@ -3,4 +3,4 @@ __ASSAY__ = 'vdj' CHAINS = { "TCR": ["TRA", "TRB"], "BCR": ["IGH", "IGL", "IGK"], -} \ No newline at end of file +} diff --git a/celescope/vdj/mapping_vdj.py b/celescope/vdj/mapping_vdj.py index dd811695..45776487 100755 --- a/celescope/vdj/mapping_vdj.py +++ b/celescope/vdj/mapping_vdj.py @@ -63,7 +63,7 @@ class Mapping_vdj(Step): 'mixcr exportAlignments ' f'{self.read2_vdjca} {self.alignments} ' '-readIds --force-overwrite -vGene -dGene -jGene -cGene ' - '-nFeature CDR3 -aaFeature CDR3 ' + '-nFeature CDR3 -aaFeature CDR3 ' ) Mapping_vdj.run_mixcr.logger.info(cmd) @@ -71,7 +71,7 @@ class Mapping_vdj(Step): @utils.add_log def mixcr_summary(self, total_read, df_align): - + align_read = df_align.shape[0] self.add_metric( name=f"{self.read_type} Mapped to Any VDJ Gene", @@ -198,26 +198,25 @@ class Mapping_vdj(Step): @utils.add_log def mapping_vdj(args): - # TODO + # TODO # add TCR or BCR prefix to distinguish them in html report summary; should improve step_name = f"{args.type}_mapping_vdj" mapping_vdj_obj = Mapping_vdj(args, step_name) mapping_vdj_obj.run() - def get_opts_mapping_vdj(parser, sub_program): parser.add_argument("--type", help='TCR or BCR', required=True) parser.add_argument( - '--species', - choices=['hs', 'mmu'], - help='Default `hs`. `hs`(human) or `mmu`(mouse). ', + '--species', + choices=['hs', 'mmu'], + help='Default `hs`. `hs`(human) or `mmu`(mouse). ', default='hs' ) parser.add_argument("--not_consensus", action='store_true', help="Input fastq is not consensused.") if sub_program: parser.add_argument( - "--fq", + "--fq", help="Required. 
Input fastq file.", required=True, ) diff --git a/celescope/vdj/multi_vdj.py b/celescope/vdj/multi_vdj.py index 5d2a4415..d6257559 100755 --- a/celescope/vdj/multi_vdj.py +++ b/celescope/vdj/multi_vdj.py @@ -17,7 +17,6 @@ class Multi_vdj(Multi): ) self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) - def count_vdj(self, sample): # count_vdj step = 'count_vdj' @@ -33,11 +32,10 @@ class Multi_vdj(Multi): self.process_cmd(cmd, step, sample, m=8, x=self.args.thread) - def main(): multi = Multi_vdj(__ASSAY__) multi.run() + if __name__ == '__main__': main() - -- Gitee From 6acf5b48c704875b7e84b99a6593bf35e49dbe91 Mon Sep 17 00:00:00 2001 From: pigraul Date: Thu, 24 Jun 2021 09:22:30 +0800 Subject: [PATCH 74/96] add Dynaseq --- celescope/__init__.py | 1 + celescope/dynaseq/Generate_T_C_matrix.R | 51 +++ celescope/dynaseq/__init__.py | 23 ++ celescope/dynaseq/analysis.py | 104 ++++++ celescope/dynaseq/conversion.py | 267 ++++++++++++++ celescope/dynaseq/multi_dynaseq.py | 63 ++++ celescope/dynaseq/replace_tsne.py | 177 ++++++++++ celescope/dynaseq/replacement.py | 331 ++++++++++++++++++ celescope/dynaseq/star.py | 163 +++++++++ celescope/dynaseq/subsitution.py | 208 +++++++++++ celescope/templates/html/dynaseq/base.html | 156 +++++++++ .../html/dynaseq/replace_tsne_summary.html | 54 +++ .../html/dynaseq/replacement_summary.html | 3 + .../html/dynaseq/subsitution_summary.html | 13 + celescope/tools/multi.py | 10 +- celescope/tools/utils.py | 7 +- docs/dynaseq/analysis.md | 51 +++ docs/dynaseq/conversion.md | 26 ++ docs/dynaseq/replace_tsne.md | 30 ++ docs/dynaseq/replacement.md | 36 ++ docs/dynaseq/star.md | 56 +++ docs/dynaseq/subsitution.md | 20 ++ 22 files changed, 1844 insertions(+), 6 deletions(-) create mode 100755 celescope/dynaseq/Generate_T_C_matrix.R create mode 100755 celescope/dynaseq/__init__.py create mode 100755 celescope/dynaseq/analysis.py create mode 100755 celescope/dynaseq/conversion.py create mode 100755 celescope/dynaseq/multi_dynaseq.py create mode 100755 celescope/dynaseq/replace_tsne.py create mode 100755 celescope/dynaseq/replacement.py create mode 100755 celescope/dynaseq/star.py create mode 100755 celescope/dynaseq/subsitution.py create mode 100755 celescope/templates/html/dynaseq/base.html create mode 100644 celescope/templates/html/dynaseq/replace_tsne_summary.html create mode 100644 celescope/templates/html/dynaseq/replacement_summary.html create mode 100644 celescope/templates/html/dynaseq/subsitution_summary.html create mode 100644 docs/dynaseq/analysis.md create mode 100644 docs/dynaseq/conversion.md create mode 100644 docs/dynaseq/replace_tsne.md create mode 100644 docs/dynaseq/replacement.md create mode 100644 docs/dynaseq/star.md create mode 100644 docs/dynaseq/subsitution.md diff --git a/celescope/__init__.py b/celescope/__init__.py index 28a2454c..906341fc 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -16,6 +16,7 @@ ASSAY_DICT = { 'tag': 'Single-cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', + 'dynaseq': 'Single Cell Dynaseq' } ROOT_PATH = os.path.dirname(__file__) diff --git a/celescope/dynaseq/Generate_T_C_matrix.R b/celescope/dynaseq/Generate_T_C_matrix.R new file mode 100755 index 00000000..d97535f7 --- /dev/null +++ b/celescope/dynaseq/Generate_T_C_matrix.R @@ -0,0 +1,51 @@ +args <- commandArgs(T) + +require("reshape2") +require("tidyr") +require("dplyr") +require("Matrix") + +my.count1 <- read.table(args[1],h=F) + +my.count1$V1 <- as.character(my.count1$V1) +my.count1$gene <- 
gsub("--C","",my.count1$V1) +my.count1$gene <- gsub("--T","",my.count1$gene) +cells.keep <- my.count1 %>% dplyr::distinct(V2,V3,gene) %>% group_by(V2) %>% dplyr::summarize(count=n()) %>% arrange(desc(count)) %>% .$V2 %>% as.character + +inds <- as.numeric(args[2]) +if (length(cells.keep) > inds) { + cells.keep2 <- head(cells.keep,inds) +}else{ cells.keep2 <- cells.keep} + +my.count1 <- my.count1 %>% filter(V2 %in% cells.keep2) %>% droplevels +my.count1$type <- "C" +my.count1[grep("--T",my.count1$V1),]$type <- "T" +my.count2 <- dcast(my.count1,gene+V2+V3 ~ type, value.var = "V4") +my.count2[is.na(my.count2)] <- 0 + +if(! "C" %in% colnames(my.count2)) +{ + my.count2$C <- 0; +} +if (ncol(my.count2) !=5) { + stop("Error! Please verify the count data frame!\n"); +} +my.count2 <- my.count2 %>% arrange(gene,V2,V3,C,T) +my.count2 %>% mutate(type = ifelse(C > 0,"C","T")) -> my.count2 +my.count3 <- my.count2 %>% group_by(gene,type,V2) %>% dplyr::summarize(count=n()) +my.count3$gene2 <- paste(my.count3$gene,my.count3$type,sep="--") +my.count3$V2 <- as.factor(my.count3$V2) +my.count3$gene2 <- as.factor(my.count3$gene2) +data.sparse = sparseMatrix(as.integer(my.count3$gene2), as.integer(my.count3$V2), x = my.count3$count) +colnames(data.sparse) = levels(my.count3$V2) +rownames(data.sparse) = levels(my.count3$gene2) +ord <- sort(colSums(data.sparse),decreasing = T) +data.sparse <- data.sparse[,names(ord)] +saveRDS(data.sparse,file=args[3]) +outtsv<-paste(args[3],"tsv", sep = ".") +write.table(as.matrix(data.sparse), file = outtsv, sep = "\t", quote = F, row.names = T) + + + + +
diff --git a/celescope/dynaseq/__init__.py b/celescope/dynaseq/__init__.py new file mode 100755 index 00000000..6ed6df1f --- /dev/null +++ b/celescope/dynaseq/__init__.py @@ -0,0 +1,23 @@ +__STEPS__ = [ + 'sample', + 'barcode', + 'cutadapt', + 'star', + "featureCounts", + "count", + 'analysis', + 'conversion', + 'subsitution', + 'replacement', + 'replace_tsne'] + +__ASSAY__ = 'dynaseq' + +# m: memory +# x: thread +RESOURCE = { + 'sample': {'m':1, 'x':1}, + 'barcode': {'m':5, 'x':1}, + 'cutadapt': {'m':5, 'x':1}, + 'star': {'m':30, 'x':1}, +}
diff --git a/celescope/dynaseq/analysis.py b/celescope/dynaseq/analysis.py new file mode 100755 index 00000000..ad5ec8f8 --- /dev/null +++ b/celescope/dynaseq/analysis.py @@ -0,0 +1,104 @@ +import pandas as pd + +from celescope.tools.analysis_mixin import AnalysisMixin +from celescope.tools.step import Step +from celescope.tools.utils import add_log, get_id_name_dict, s_common + + +@add_log +def generate_matrix(gtf_file, matrix_file): + + id_name = get_id_name_dict(gtf_file) + matrix = pd.read_csv(matrix_file, sep="\t") + + gene_name_col = matrix.geneID.apply(lambda x: id_name[x]) + matrix.geneID = gene_name_col + matrix = matrix.drop_duplicates(subset=["geneID"], keep="first") + matrix = matrix.dropna() + matrix = matrix.rename({"geneID": ""}, axis='columns') + return matrix + + +class Analysis_dynaseq(Step, AnalysisMixin): + """ + Features + - Cell clustering with Seurat. + + - Calculate the marker gene of each cluster. + + - Cell type annotation (optional). You can provide markers of known cell types and annotate cell types for each cluster. + + Output + - `markers.tsv` Marker genes of each cluster. + + - `tsne_coord.tsv` t-SNE coordinates and clustering information. + + - `{sample}/06.analysis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` + parameter is provided.
The result contains 3 files: + - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", + it means that the given marker is not enough to identify the cluster. + - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. + - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) + """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + AnalysisMixin.__init__(self, args) + self.matrix_file = args.matrix_file + self.genomeDir = args.genomeDir + self.type_marker_tsv = args.type_marker_tsv + self.auto_assign_bool = False + self.save_rds = args.save_rds + if args.type_marker_tsv and args.type_marker_tsv != 'None': + self.auto_assign_bool = True + self.save_rds = True + + def run(self): + self.seurat(self.matrix_file, self.save_rds, self.genomeDir) + if self.auto_assign_bool: + self.auto_assign(self.type_marker_tsv) + self.run_analysis() + self.add_data_item(cluster_tsne=self.cluster_tsne) + self.add_data_item(gene_tsne=self.gene_tsne) + self.add_data_item(table_dict=self.table_dict) + + self.clean_up() + + +@add_log +def analysis(args): + + step_name = "analysis" + ana = Analysis_dynaseq(args, step_name) + ana.run() + + +def get_opts_analysis(parser, sub_program): + + parser.add_argument('--genomeDir', help='Required. Genome directory.', required=True) + parser.add_argument('--save_rds', action='store_true', help='Write rds to disk.') + parser.add_argument( + '--type_marker_tsv', + help="""A tsv file with header. If this parameter is provided, cell type will be annotated. Example: +``` +cell_type marker +Alveolar "CLDN18,FOLR1,AQP4,PEBP4" +Endothelial "CLDN5,FLT1,CDH5,RAMP2" +Epithelial "CAPS,TMEM190,PIFO,SNTN" +Fibroblast "COL1A1,DCN,COL1A2,C1R" +B_cell "CD79A,IGKC,IGLC3,IGHG3" +Myeloid "LYZ,MARCO,FCGR3A" +T_cell "CD3D,TRBC1,TRBC2,TRAC" +LUAD "NKX2-1,NAPSA,EPCAM" +LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" +```""" + ) + if sub_program: + parser.add_argument( + '--matrix_file', + help='Required. Matrix_10X directory from step count.', + required=True, + ) + parser = s_common(parser) + + + diff --git a/celescope/dynaseq/conversion.py b/celescope/dynaseq/conversion.py new file mode 100755 index 00000000..fc300d66 --- /dev/null +++ b/celescope/dynaseq/conversion.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python +# v1.0 + +import pysam +import os +import subprocess +import numpy as np +import pandas as pd +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils + + +class Conversion(Step): + """ + Features + - Get conversion pos in each read. + - Get snp info. + + Output + - `{sample}.PosTag.bam` Bam file with conversion info. + - `{sample}.PosTag.csv` SNP info in csv format. 
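The per-read tags written to `{sample}.PosTag.bam` (`SC` specific conversions, `TC` reference base content, `TL`/`AL` conversion positions, `ST` gene strand) can be read back with pysam. A minimal sketch, assuming an indexed output BAM; the file name is a placeholder:
```
import pysam

with pysam.AlignmentFile("sample.PosTag.bam", "rb") as bam:  # placeholder path
    for read in bam.fetch():
        try:
            strand = read.get_tag("ST")  # gene strand from the strandedness table
            # TL holds reference positions of t->C conversions, AL of a->G
            locs = read.get_tag("TL") if strand == "+" else read.get_tag("AL")
        except KeyError:
            continue  # defensive: skip reads written without tags
        if locs[0] != 0:  # [0] is this module's sentinel for no conversions
            print(read.query_name, strand, list(locs))
```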
+ """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + # input files + self.ifile = os.path.join(args.outdir, args.sample+'.bam') + self.sample = args.sample + self.strandednessfile = args.strand + self.inbam = args.bam + self.bcfile = args.cell + self.outdir = args.outdir + self.thread = args.thread + + # output files + self.outfile_bam = os.path.join(args.outdir, args.sample+'.PosTag.bam') + self.outfile_csv = os.path.join(args.outdir, args.sample+'.PosTag.csv') + + @utils.add_log + def run(self): + ##Filter and sort + self.fltSort(self.inbam,self.ifile,self.bcfile,self.thread) + cmd=['samtools index',self.ifile] + self.run_cmd(cmd) + + ##Adding tags + self.addTags(self.ifile,self.outfile_bam,self.strandednessfile) + cmd=['samtools index',self.outfile_bam] + self.run_cmd(cmd) + + #Obtaining conversion positions + bam = pysam.AlignmentFile(self.outfile_bam, 'rb') + ContigLocs, AnnoteLocs=self.CountConvperPos(bam) + + #Obtaining coverage over conversion position + ConvsPerPos,CoverofPosWithConvs = self.CountReadConverPerConvPos(bam,ContigLocs) + A=self.ExportasVcf(ConvsPerPos,CoverofPosWithConvs,AnnoteLocs) + A['sample'] = self.sample + #Saving result + A.to_csv(self.outfile_csv) + bam.close() + + cmd=['rm', self.ifile] + self.run_cmd(cmd) + cmd=['rm', self.ifile+'.bai'] + self.run_cmd(cmd) + + def run_cmd(self,cmd): + subprocess.call(' '.join(cmd),shell=True) + + @utils.add_log + def CountConvperPos(self,bamfile): + ContigLocs={} + AnnoteLocs={} + for read in bamfile.fetch(): + try: + if read.get_tag('ST')=='+': + locs=read.get_tag('TL') + else: + locs=read.get_tag('AL') + if locs[0]!=0: + if read.reference_name in ContigLocs: + ContigLocs[read.reference_name].extend(locs) + else: + ContigLocs[read.reference_name] = list(locs) + if read.reference_name not in AnnoteLocs: + for i,each in enumerate(locs): + if i == 0: + AnnoteLocs[read.reference_name] = { each :read.get_tag('XT')} + else: + AnnoteLocs[read.reference_name][each] = read.get_tag('XT') + else: + for i,each in enumerate(locs): + if each not in AnnoteLocs[read.reference_name]: + AnnoteLocs[read.reference_name][each] = read.get_tag('XT') + except (ValueError,KeyError): + continue + return ContigLocs, AnnoteLocs + + @utils.add_log + def CountReadConverPerConvPos(self,bam,ContigLocs): + ConvsPerPos={} + CoverofPosWithConvs={} + for key in ContigLocs.keys(): + ContigLocs[key]=sorted(ContigLocs[key]) + ConvsPerPos[key]={} + k=0 + current=ContigLocs[key][k] + k+=1 + nextone=ContigLocs[key][k] + while k < len(ContigLocs[key])-1: + ConvsPerPos[key][current]=1 + while current == nextone and k < len(ContigLocs[key])-1: + k+=1 + nextone=ContigLocs[key][k] + ConvsPerPos[key][current]+=1 + current = nextone + if k < len(ContigLocs[key])-1: + k+=1 + nextone=ContigLocs[key][k] + + CoverofPosWithConvs[key]={} + for key2 in ConvsPerPos[key].keys(): + try: + CoverofPosWithConvs[key][key2]=bam.count(key,key2,key2+1) + except ValueError: + continue + return ConvsPerPos,CoverofPosWithConvs + + @utils.add_log + def ExportasVcf(self,ConvsPerPos,CoverofPosWithConvs, AnnoteLocs): + #Chrom, Pos , ConvsPerPs, CoverofPosWithConvs + Outputdf =pd.DataFrame(columns=['pos2','convs','covers','chrom','posratio']) + for key in ConvsPerPos.keys(): + df=pd.DataFrame.from_dict(ConvsPerPos[key], orient='index') + df1=pd.DataFrame.from_dict(CoverofPosWithConvs[key], orient='index') + df.index.name='pos' + df1.index.name='pos' + df.columns = ['convs'] + df1.columns = ['covers'] + df2=df.join(df1) + df2['pos2'] = df2.index + df2.index = 
np.arange(df2.shape[0]) + df2['chrom']=np.repeat(key,df2.shape[0]) + df2['posratio']=df2['convs']/df2['covers'] + df3=pd.DataFrame.from_dict(AnnoteLocs[key], orient='index') + df3.columns = ['gene_id'] + df2=df2.join(df3, on='pos2') + Outputdf=Outputdf.append(df2) + return Outputdf.reset_index(drop=True) + + def createTag(self,d): + return ''.join([''.join(key) + str(d[key]) + ';' for key in d.keys()])[:-1] + + + def convInRead(self, read, qual = 20): + specific_conversions = {} + total_content = {'a' : 0, 'c' : 0, 'g' : 0, 't' : 0} + specific_conversions[('c', 'A')] = 0 + specific_conversions[('g', 'A')] = 0 + specific_conversions[('t', 'A')] = 0 + specific_conversions[('a', 'C')] = 0 + specific_conversions[('g', 'C')] = 0 + specific_conversions[('t', 'C')] = 0 + specific_conversions[('a', 'G')] = 0 + specific_conversions[('c', 'G')] = 0 + specific_conversions[('t', 'G')] = 0 + specific_conversions[('a', 'T')] = 0 + specific_conversions[('c', 'T')] = 0 + specific_conversions[('g', 'T')] = 0 + specific_conversions[('a', 'N')] = 0 + specific_conversions[('c', 'N')] = 0 + specific_conversions[('g', 'N')] = 0 + specific_conversions[('t', 'N')] = 0 + + tC_loc = [] + aG_loc = [] + + try: + refseq = read.get_reference_sequence().lower() + except (UnicodeDecodeError): + refseq='' + + for base in total_content.keys(): + total_content[base] += refseq.count(base) + for pair in read.get_aligned_pairs(with_seq=True): + try: + if pair[0] is not None and pair[1] is not None and pair[2] is not None: + if str(pair[2]).islower() and not read.query_qualities[pair[0]] < qual: + specific_conversions[(pair[2],read.seq[pair[0]])] += 1 + if (pair[2],read.seq[pair[0]]) == ('t', 'C'): + tC_loc.append(pair[1]) + if (pair[2],read.seq[pair[0]]) == ('a', 'G'): + aG_loc.append(pair[1]) + except (UnicodeDecodeError, KeyError): + continue + SC_tag = self.createTag(specific_conversions) + TC_tag = self.createTag(total_content) + + if len(tC_loc) == 0: + tC_loc.append(0) + if len(aG_loc) == 0: + aG_loc.append(0) + return SC_tag, TC_tag, tC_loc, aG_loc + + @utils.add_log + def addTags(self,bamfilename, outputname,strandednessfile): + bamfile = pysam.AlignmentFile(bamfilename, 'rb') + mod_bamfile = pysam.AlignmentFile(outputname, mode='wb',template=bamfile) + strandedness = pd.read_csv(strandednessfile, header=None, index_col=0) + for read in bamfile.fetch(): + try: + tags = self.convInRead(read) + read.set_tag('SC',tags[0],'Z') + read.set_tag('TC',tags[1],'Z') + read.set_tag('TL',tags[2]) + read.set_tag('AL',tags[3]) + read.set_tag('ST',strandedness.loc[read.get_tag('XT')][1]) + mod_bamfile.write(read) + except (ValueError,KeyError): + continue + + bamfile.close() + mod_bamfile.close() + + @utils.add_log + def fltSort(self,bamfilename, outfile_bam,cellfile, thread=8): + bamfile = pysam.AlignmentFile(bamfilename, 'rb') + mod_bamfile = pysam.AlignmentFile(outfile_bam, mode='wb',template=bamfile) + cells={} + with open(cellfile) as f: + for i in f: + cells[i.strip()] = 1 + for read in bamfile.fetch(until_eof=True): + try: + if not read.has_tag('GX'): continue + if read.get_tag("CB") not in cells: continue + mod_bamfile.write(read) + except (ValueError,KeyError): + continue + bamfile.close() + mod_bamfile.close() + + cmd=['samtools sort -@',str(thread), '-o', outfile_bam+'.bam',outfile_bam] + self.run_cmd(cmd) + cmd=['mv',outfile_bam+'.bam',outfile_bam] + self.run_cmd(cmd) + + + + + +@utils.add_log +def conversion(args): + + step_name = "conversion" + conversion_obj = Conversion(args, step_name) + conversion_obj.run() + +def 
get_opts_conversion(parser, sub_program):
+    parser.add_argument('--strand', help='Gene strand file, two columns with no header: gene_id and strand.', required=True)
+    if sub_program:
+        parser.add_argument("--bam", help='featureCounts output BAM file.', required=True)
+        parser.add_argument("--cell", help='Cell barcode list file, one barcode per line.', required=True)
+        parser = s_common(parser)
+    return parser
+
diff --git a/celescope/dynaseq/multi_dynaseq.py b/celescope/dynaseq/multi_dynaseq.py
new file mode 100755
index 00000000..ed6795f5
--- /dev/null
+++ b/celescope/dynaseq/multi_dynaseq.py
@@ -0,0 +1,63 @@
+from celescope.dynaseq.__init__ import __ASSAY__
+from celescope.tools.multi import Multi
+
+
+class Multi_dynaseq(Multi):
+
+    def conversion(self, sample):
+        step = 'conversion'
+        bam = f'{self.outdir_dic[sample]["featureCounts"]}/{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam'
+        cell = f'{self.outdir_dic[sample]["count"]}/{sample}_matrix_10X/barcodes.tsv'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--bam {bam} '
+            f'--cell {cell} '
+        )
+        self.process_cmd(cmd, step, sample, m=5, x=1)
+
+    def subsitution(self, sample):
+        step = 'subsitution'
+        bam = f'{self.outdir_dic[sample]["conversion"]}/{sample}.PosTag.bam'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--bam {bam} '
+        )
+        self.process_cmd(cmd, step, sample, m=1, x=1)
+
+
+    def replacement(self, sample):
+        step = 'replacement'
+        bam = f'{self.outdir_dic[sample]["conversion"]}/{sample}.PosTag.bam'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--bam {bam} '
+            f'--bg {self.col5_dict[sample]} '
+        )
+        self.process_cmd(cmd, step, sample, m=10, x=1)
+
+
+    def replace_tsne(self, sample):
+        step = 'replace_tsne'
+        tsne_file = f'{self.outdir_dic[sample]["analysis"]}/{sample}_tsne_coord.tsv'
+        mat_file = f'{self.outdir_dic[sample]["replacement"]}/{sample}.fraction_of_newRNA_matrix.txt'
+        rep_file = f'{self.outdir_dic[sample]["replacement"]}/{sample}.fraction_of_newRNA_per_cell.txt'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--tsne {tsne_file} '
+            f'--mat {mat_file} '
+            f'--rep {rep_file} '
+        )
+        self.process_cmd(cmd, step, sample, m=1, x=1)
+
+
+def main():
+    multi = Multi_dynaseq(__ASSAY__)
+    multi.run()
+
+if __name__ == '__main__':
+    main()
+
diff --git a/celescope/dynaseq/replace_tsne.py b/celescope/dynaseq/replace_tsne.py
new file mode 100755
index 00000000..a0b84eb1
--- /dev/null
+++ b/celescope/dynaseq/replace_tsne.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+# coding=utf8
+
+import os
+import pandas as pd
+import plotly
+import plotly.graph_objects as go
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+
+
+class Replace_tsne(Step):
+    """
+    Features
+    - Plot the turn-over (replacement) rate of each cell on t-SNE coordinates.
+    - Report the top replaced genes in each cluster.
+
+    Output
+    - `{sample}.rep_in_tsne.txt` Turn-over rate of each cell, with t-SNE coordinates and cluster.
+    - `{sample}.rep_in_tsne_top10.txt` Top 10 replaced genes in each cluster.
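+
+    A minimal sketch of consuming the per-cell output downstream (path is
+    illustrative; columns follow the header written by `dot_tsne` below):
+    ```
+    import pandas as pd
+
+    # Columns: Cell, tSNE_1, tSNE_2, Cluster, ratio (0 for cells without a measured rate)
+    df = pd.read_table("sample.rep_in_tsne.txt")
+    print(df.groupby("Cluster")["ratio"].mean().sort_values(ascending=False))
+    ```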
+    """
+
+    def __init__(self, args, step_name):
+        Step.__init__(self, args, step_name)
+
+        # input files
+        self.sample = args.sample
+        self.tsnefile = args.tsne
+        self.matfile = args.mat
+        self.repfile = args.rep
+        self.mincell = args.mincell
+        self.topgene = args.topgene
+        # output files
+        self.outdot = os.path.join(self.outdir, self.sample+'.rep_in_tsne.txt')
+        self.outtbl = os.path.join(self.outdir, self.sample+'.rep_in_tsne_top10.txt')

+    @utils.add_log
+    def run(self):
+        # turn-over rate of cells in each cluster
+        self.dot_tsne(self.repfile, self.tsnefile, self.outdot)
+        div_item = self.tsne_plot(self.outdot)
+        # high turn-over genes in each cluster
+        self.top_gene_cluster(self.matfile, self.tsnefile, self.outtbl, self.mincell, self.topgene)
+        tbltxt = pd.read_csv(self.outtbl, header=0, sep="\t")
+        tbldiv = self.tsne_table(tbltxt)
+
+        # report
+        self.report_prepare(div_item, tbldiv)
+        self.clean_up()
+
+
+
+    @utils.add_log
+    def dot_tsne(self, repfile, tsnefile, outfile):
+        cells = {}
+        with open(repfile, 'r') as f:
+            for i in f:
+                ii = i.strip().split()
+                cells[ii[0]] = ii[1]
+
+        outf = open(outfile, 'w')
+        outf.write("Cell\ttSNE_1\ttSNE_2\tCluster\tratio\n")
+        with open(tsnefile, 'r') as f:
+            f.readline()
+            for i in f:
+                ii = i.strip().split()
+                if ii[0] in cells:
+                    outl = '\t'.join(ii[0:4])+'\t'+cells[ii[0]]+'\n'
+                else:
+                    outl = '\t'.join(ii[0:4])+'\t0'+'\n'
+                outf.write(outl)
+        outf.close()
+
+    @utils.add_log
+    def tsne_plot(self, txt):
+        df = pd.read_table(txt)
+        # assign the sorted result so that high-rate cells are drawn last (on top)
+        df = df.sort_values(by="ratio")
+        newtitle = "t-SNE plot Colored by RNA Turn-over rate"
+
+        fig = go.Figure()
+        fig.add_trace(go.Scatter(x=df['tSNE_1'], y=df['tSNE_2'], mode='markers',
+                                 marker_opacity=0.9, marker_size=4, marker_color=df['ratio'],
+                                 marker_colorscale="PuBu", marker_showscale=True,
+                                 ))
+        fig.update_layout(height=600, width=600, title_text=newtitle)
+        fig.update_layout(plot_bgcolor = '#FFFFFF')
+        fig.update_xaxes(showgrid=False, linecolor='black', showline=True, ticks='outside', title_text='t-SNE1')
+        fig.update_yaxes(showgrid=False, linecolor='black', showline=True, ticks='outside', title_text='t-SNE2')
+
+        div = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')
+
+        return div
+
+
+    def tsne_table(self, txt):
+        marker_gene_table = txt.to_html(
+            escape=False,
+            index=False,
+            table_id='replacement_table_cluster',
+            justify="center")
+
+        return marker_gene_table
+
+
+    def file_stat(self, infile, clu):
+        clus = list(set(clu.values()))
+        cluster = {}
+        for c in clus:
+            cluster[c] = {}
+        fn = open(infile, "r")
+        fnh = fn.readline().strip().split()
+        for i in fn:
+            ii = i.strip().split()
+            for j in range(1, len(ii)):
+                if ii[j] == 'NA': continue
+                if fnh[j] not in clu: continue
+                if ii[0] not in cluster[clu[fnh[j]]]:
+                    cluster[clu[fnh[j]]][ii[0]] = []
+                cluster[clu[fnh[j]]][ii[0]].append(float(ii[j]))
+        fn.close()
+        return cluster
+
+    def tsne_file(self, infile):
+        clu = {}
+        with open(infile) as f:
+            f.readline()
+            for i in f:
+                ii = i.strip().split()
+                clu[ii[0]] = ii[3]
+        return clu
+
+
+    @utils.add_log
+    def top_gene_cluster(self, matrix, tsnefile, outfile, mincell=5, topgene=10):
+        tsne = self.tsne_file(tsnefile)
+        cluster = self.file_stat(matrix, tsne)
+
+        w = open(outfile, 'w')
+        w.write("cluster\tgene\tTurn-over_rate\tcells\n")
+        for c in cluster:
+            tmp = {}
+            for g in cluster[c]:
+                gt = sum(cluster[c][g]) / len(cluster[c][g])
+                tmp[g] = gt
+            sorttmp = sorted(tmp.items(), key=lambda item:item[1], reverse=True)
+            tmpn = 0
+            for x in sorttmp:
+                if len(cluster[c][x[0]]) < mincell: continue
+                tmpn += 1
+                if tmpn > topgene: break
+                # one row per retained gene: cluster, gene, mean turn-over rate, cell count
+                
w.write('cluster'+c+'\t'+x[0]+'\t'+str(x[1])+'\t'+str(len(cluster[c][x[0]]))+'\n')
+        w.close()
+
+
+    def report_prepare(self, outdiv, outable):
+        self.add_data_item(replace_tsne=outdiv)
+        self.add_data_item(replace_tsne_table=outable)
+
+
+@utils.add_log
+def replace_tsne(args):
+
+    step_name = "replace_tsne"
+    replace_tsne_obj = Replace_tsne(args, step_name)
+    replace_tsne_obj.run()
+
+def get_opts_replace_tsne(parser, sub_program):
+    if sub_program:
+        parser.add_argument('--tsne', help='t-SNE coordinates file (tsne_coord.tsv from the analysis step).', required=True)
+        parser.add_argument('--mat', help='Fraction-of-new-RNA matrix file from the replacement step.', required=True)
+        parser.add_argument('--rep', help='Fraction-of-new-RNA per cell file from the replacement step.', required=True)
+        parser.add_argument('--mincell', type=int, default=5, help='Only report genes with turn-over detected in at least this many cells. Default 5.')
+        parser.add_argument('--topgene', type=int, default=10, help='Number of top genes to report per cluster. Default 10.')
+        parser = s_common(parser)
+    return parser
+
diff --git a/celescope/dynaseq/replacement.py b/celescope/dynaseq/replacement.py
new file mode 100755
index 00000000..c8ac2a15
--- /dev/null
+++ b/celescope/dynaseq/replacement.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python
+# coding=utf8
+
+import os
+import sys
+import subprocess
+import pandas as pd
+import pysam
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+toolsdir = os.path.dirname(__file__)
+
+
+class Replacement(Step):
+    """
+    Features
+    - Computes the replacement rates in each cell and gene.
+    - Boxplots of the rate distributions.
+
+    Output
+    - `{sample}.TC_matrix.rds` New and old RNA info for each barcode/gene/UMI.
+    - `{sample}.new_matrix.tsv.gz` New RNA matrix.
+    - `{sample}.old_matrix.tsv.gz` Old RNA matrix.
+    - `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell.
+    - `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene.
+    - `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene.
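+
+    A minimal sketch of recomputing the per-cell fraction from the two gzipped
+    matrices, assuming genes as rows and cell barcodes as columns:
+    ```
+    import pandas as pd
+
+    new = pd.read_csv("sample.new_matrix.tsv.gz", sep="\t", index_col=0)
+    old = pd.read_csv("sample.old_matrix.tsv.gz", sep="\t", index_col=0)
+    # fraction of new RNA per cell = new UMIs / (new + old) UMIs
+    fraction = new.sum() / (new.sum() + old.sum())
+    print(fraction.sort_values(ascending=False).head())
+    ```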
+ """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + # input files + self.outdir = args.outdir + self.sample = args.sample + self.bam_file = args.bam + self.snp_file = args.bg + self.bg_cov = args.bg_cov + self.cell_keep = args.cell_keep + # output files + self.outread = os.path.join(self.outdir, self.sample+'.corrected_gene_cell_UMI_read.txt') + self.outrds = os.path.join(self.outdir, self.sample+'.TC_matrix.rds') + self.outpre = os.path.join(self.outdir,self.sample) + + @utils.add_log + def run(self): + # get backgroud snp + bg = self.background_snp(self.snp_file,self.bg_cov) + # get reads with TC + self.extract_dem(self.bam_file,self.outread,bg) + # run_R + self.generate_TC_matrix(self.outread, self.outrds, self.cell_keep) + + # split to New and Old Matrix + totMat = self.outrds+'.tsv' + new_mat = self.outpre+'.new_matrix.tsv' + old_mat = self.outpre+'.old_matrix.tsv' + con_mat = self.outpre+'.NvsO_matrix.tsv' + self.split_matrix(totMat,self.outpre) + + # replacement stat + self.replacment_stat(con_mat,self.outpre) + # plot + div_item = self.replacment_plot(self.outpre) + + # report + self.report_prepare(div_item) + self.clean_up() + + # clean + cmd=['rm', self.outread] + self.run_cmd(cmd) + cmd=['rm', self.outrds+'.tsv'] + self.run_cmd(cmd) + cmd=['rm', con_mat] + self.run_cmd(cmd) + cmd=['gzip', new_mat] + self.run_cmd(cmd) + cmd=['gzip', old_mat] + self.run_cmd(cmd) + + def run_cmd(self,cmd): + subprocess.call(' '.join(cmd),shell=True) + + @utils.add_log + def extract_dem(self,bam,outfile,bg): + bamfile = pysam.AlignmentFile(bam, 'rb') + countdict = {} + for read in bamfile.fetch(): + try: + chro = read.reference_name + cb = read.get_tag('CB') + ub = read.get_tag('UB') + if not read.has_tag('GN'): continue + gene = read.get_tag('GN') + + if read.get_tag('ST') == '+': + stag = read.get_tag('TL') + else: + stag = read.get_tag('AL') + if len(stag)==1 and stag[0]==0: + gene += '--T' + else: + fcount = 0 + for si in range(0,len(stag)): + pos = chro + '_' + str(stag[si]) + if pos in bg: + fcount += 1 + if fcount == len(stag): + gene += '--T' + else: + gene += '--C' + + readinfo = '\t'.join([gene,cb,ub]) + if readinfo not in countdict: + countdict[readinfo] = 1 + else: + countdict[readinfo] += 1 + + except (ValueError,KeyError): + continue + bamfile.close() + + out1 = open(outfile,'w') + for rid in countdict: + out1.write(rid+'\t'+str(countdict[rid])+'\n') + out1.close() + + @utils.add_log + def background_snp(self,bgfile,cov=1): + outdict = {} + if bgfile.endswith('.csv'): + with open(bgfile) as f: + f.readline() + for i in f: + ii = i.strip().split(',') + if int(ii[2]) {self.ribo_run_log} 2>&1 ' + ) + Star_dynaseq.ribo.logger.info(cmd) + subprocess.check_call(cmd, shell=True) + + @utils.add_log + def picard(self): + cmd = [ + 'picard', + '-Xmx20G', + '-XX:ParallelGCThreads=4', + 'CollectRnaSeqMetrics', + 'I=%s' % (self.STAR_bam), + 'O=%s' % (self.picard_region_log), + 'REF_FLAT=%s' % (self.refflat), + 'STRAND=NONE', + 'VALIDATION_STRINGENCY=SILENT'] + cmd_str = ' '.join(cmd) + Star_dynaseq.picard.logger.info(cmd_str) + subprocess.check_call(cmd) + + @utils.add_log + def run(self): + self.run_star() + self.picard() + if self.debug: + self.ribo() + self.add_other_metrics() + self.clean_up() + + +def star(args): + step_name = "star" + runner = Star_dynaseq(args, step_name) + runner.run() + + +def get_opts_star(parser, sub_program): + get_opts_star_mixin(parser, sub_program) diff --git a/celescope/dynaseq/subsitution.py b/celescope/dynaseq/subsitution.py 
new file mode 100755
index 00000000..44f251be
--- /dev/null
+++ b/celescope/dynaseq/subsitution.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python
+# coding=utf8
+
+import os
+import pysam
+import re
+import pandas as pd
+import plotly
+import plotly.graph_objects as go
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+
+class Subsitution(Step):
+    """
+    Features
+    - Computes the overall conversion rates in reads and plots a barplot.
+
+    Output
+    - `{sample}.substitution.txt` Tab-separated table of the overall conversion rates.
+    """
+
+    def __init__(self, args, step_name):
+        Step.__init__(self, args, step_name)
+
+        # input files
+        self.sample = args.sample
+        self.bam_file = args.bam
+        self.outdir = args.outdir
+
+        # output files
+        self.outstat = os.path.join(self.outdir, self.sample+'.substitution.txt')
+
+
+    @utils.add_log
+    def run(self):
+        # overall rate
+        for_base, rev_base, is_forward, is_reverse = self.get_sub_tag(self.bam_file)
+        self.sub_stat(for_base, rev_base, is_forward, is_reverse, self.outstat)
+        div_item = self.sub_plot(self.outstat)
+
+        self.report_prepare(div_item)
+        self.clean_up()
+
+
+    @utils.add_log
+    def get_sub_tag(self, bam):
+        bamfile = pysam.AlignmentFile(bam, 'rb')
+        is_reverse = {'cA':0, 'gA':0, 'tA':0, 'aC':0, 'gC':0, 'tC':0, 'aG':0, 'cG':0, 'tG':0, 'aT':0, 'cT':0, 'gT':0}
+        is_forward = {'cA':0, 'gA':0, 'tA':0, 'aC':0, 'gC':0, 'tC':0, 'aG':0, 'cG':0, 'tG':0, 'aT':0, 'cT':0, 'gT':0}
+        for_base = {'a':0, 'c':0, 'g':0, 't':0}
+        rev_base = {'a':0, 'c':0, 'g':0, 't':0}
+        snp_tags = ['', 'cA', 'gA', 'tA', 'aC', 'gC', 'tC', 'aG', 'cG', 'tG', 'aT', 'cT', 'gT']
+        ref_tags = ['', 'a', 'c', 'g', 't']
+        for read in bamfile.fetch():
+            try:
+                snpmatch = re.match( r'cA(\d+);gA(\d+);tA(\d+);aC(\d+);gC(\d+);tC(\d+);aG(\d+);cG(\d+);tG(\d+);aT(\d+);cT(\d+);gT(\d+);', read.get_tag('SC'), re.M)
+                totmatch = re.match( r'a(\d+);c(\d+);g(\d+);t(\d+)', read.get_tag('TC'), re.M)
+                if snpmatch and totmatch:
+                    if read.is_reverse:
+                        for j in range(1, len(ref_tags)):
+                            rev_base[ref_tags[j]] += int(totmatch.group(j))
+                        for i in range(1, len(snp_tags)):
+                            is_reverse[snp_tags[i]] += int(snpmatch.group(i))
+                    else:
+                        for j in range(1, len(ref_tags)):
+                            for_base[ref_tags[j]] += int(totmatch.group(j))
+                        for i in range(1, len(snp_tags)):
+                            is_forward[snp_tags[i]] += int(snpmatch.group(i))
+            except (ValueError, KeyError):
+                continue
+        bamfile.close()
+
+        return for_base, rev_base, is_forward, is_reverse
+
+    @utils.add_log
+    def sub_stat(self, for_base, rev_base, is_forward, is_reverse, outfile):
+        convertdict = {'a':['aC','aG','aT'],
+                       'c':['cA','cG','cT'],
+                       'g':['gA','gC','gT'],
+                       't':['tA','tC','tG']}
+        subdict = {'a':'t','t':'a','c':'g','g':'c',
+                   'aC':'tG','aG':'tC','aT':'tA',
+                   'cA':'gT','cG':'gC','cT':'gA',
+                   'gA':'cT','gC':'cG','gT':'cA',
+                   'tA':'aT','tC':'aG','tG':'aC'}
+        outdict = {'aC':'A_to_C','aG':'A_to_G','aT':'A_to_T',
+                   'cA':'C_to_A','cG':'C_to_G','cT':'C_to_T',
+                   'gA':'G_to_A','gC':'G_to_C','gT':'G_to_T',
+                   'tA':'T_to_A','tC':'T_to_C','tG':'T_to_G'}
+        outw = open(outfile, 'w')
+        for x in ['a','c','g','t']:
+            fbase = for_base[x]
+            rbase = rev_base[subdict[x]]
+            for y in convertdict[x]:
+                fcov = is_forward[y]*100 / float(fbase)
+                rcov = is_reverse[subdict[y]]*100 / float(rbase)
+                outw.write(outdict[y]+'\t'+"%.3f"%fcov+'\t'+"%.3f"%rcov+'\n')
+        outw.close()

+    @utils.add_log
+    def sub_plot(self, txt):
+        df = pd.read_table(txt, header=None)
+        df.columns = ['sample', '+', '-']
+
+        fig = go.Figure()
+        ## set up the color palette: cycle the first 9 Plotly qualitative colors into a pool of 100
+        import plotly.express as px  # local import, used only for the qualitative palette
+        num4colors = 0
+        num4rainbow = 0
+        
colors_list = [] + while num4colors<100: + if num4rainbow == 9: + num4rainbow = 0 + colors_list.append(px.colors.qualitative.Plotly[num4rainbow]) + num4colors+=1 + num4rainbow+=1 + + num4sample = 0 + colors4sample = {} + num4x = 0 + + for sample in df['sample'].unique(): + legend_show = True + colors4sample[sample] = colors_list[num4sample] + num4sample += 1 + flag_x = 'x' + str(num4x+1) + df_plot = df[ df['sample'] == sample ] + num4x+=1 + + fig.add_trace(go.Bar(name=sample+'+', + x=df_plot['sample'], + y=df_plot['+'], + legendgroup=sample, + marker_color=colors4sample[sample], + marker_line_color='#FFFFFF', + showlegend=legend_show, + xaxis=flag_x) + ) + fig.add_trace(go.Bar(name=sample+'-', + x=df_plot['sample'], + y=df_plot['-'], + legendgroup=sample, + showlegend=legend_show, + marker_color=colors4sample[sample], + marker_line_color='#FFFFFF', + opacity=0.3, + xaxis=flag_x) + ) + + fig.update_layout(barmode='stack') + + per = 1/(num4x+1) + gap4bar = per/len(df['sample'].unique()) + num4x = 0 + for typeB in df['sample'].unique(): + if num4x == 0: + flag_x = 'xaxis' + else: + flag_x = 'xaxis' + str(num4x+1) + anchor_x = 'x'+str(num4x+1) + num4x += 1 + fig['layout'][flag_x] = dict(domain=[per*num4x, per*(num4x+1)-gap4bar], anchor=anchor_x, title=typeB) + + fig.update_layout(plot_bgcolor = '#FFFFFF') + fig.update_xaxes(showgrid=False, linecolor='black', showline=True, ticks='outside', showticklabels=False) + fig.update_yaxes(showgrid=False, linecolor='black', showline=True, ticks='outside') + width_num = 400 * ( len(df['sample'].unique())* len(df['sample'].unique()) ) / (5*12) ## 控制柱形图的宽度 + fig.update_layout(height=500, width=width_num) + fig.update_layout(legend=dict(orientation="h")) + fig.update_layout(legend=dict( + yanchor="top", + y=1.3, + xanchor="left", + x=0.05, + valign="top", + )) + + fig.update_layout( + yaxis_title="Rates of nucleotide substitution (%)", + ) + fig.update_xaxes( + tickangle = -80, + title_font = {"size": 15}, + title_standoff = 25 + ) + + div = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div') + + return div + + + def report_prepare(self,outdiv): + self.add_data_item(subsitution=outdiv) + + +@utils.add_log +def subsitution(args): + + step_name = "subsitution" + subsitution_obj = Subsitution(args, step_name) + subsitution_obj.run() + +def get_opts_subsitution(parser, sub_program): + if sub_program: + parser.add_argument('--bam', help='bam file', required=True) + parser = s_common(parser) + return parser diff --git a/celescope/templates/html/dynaseq/base.html b/celescope/templates/html/dynaseq/base.html new file mode 100755 index 00000000..e7adebfd --- /dev/null +++ b/celescope/templates/html/dynaseq/base.html @@ -0,0 +1,156 @@ + + + + + + report + + + + + + + + + + + + + +
+

DynaSCOPE Report

+ + {% if sample_summary is defined %} + {% include "html/common/sample_summary.html"%} + {% endif %} + + {% if barcode_summary is defined %} + {% include "html/common/barcode_summary.html"%} + {% endif %} + + {% if cutadapt_summary is defined %} + {% include "html/common/cutadapt_summary.html"%} + {% endif %} + + {% if star_summary is defined %} + {% include "html/rna/star_summary.html"%} + {% endif %} + + {% if featureCounts_summary is defined%} + {% include "html/rna/featureCounts_summary.html"%} + {% endif %} + + {% if umi_summary is defined %} + {% include "html/rna/umi_summary.html"%} + {% endif %} + + {% if cluster_tsne is defined %} + {% include "html/rna/analysis_summary.html"%} + {% endif %} + + {% if subsitution is defined %} + {% include "html/dynaseq/subsitution_summary.html"%} + {% endif %} + + {% if replacement is defined %} + {% include "html/dynaseq/replacement_summary.html"%} + {% endif %} + + {% if replace_tsne is defined %} + {% include "html/dynaseq/replace_tsne_summary.html"%} + {% endif %} + +
+ + + + diff --git a/celescope/templates/html/dynaseq/replace_tsne_summary.html b/celescope/templates/html/dynaseq/replace_tsne_summary.html new file mode 100644 index 00000000..d1f927e5 --- /dev/null +++ b/celescope/templates/html/dynaseq/replace_tsne_summary.html @@ -0,0 +1,54 @@ +

RNA Turn-over rate in clusters

+ +{{ replace_tsne|safe }} + + + + +

High Turn-over Genes in Cluster

+
+ + {{ replace_tsne_table|safe }} + + + +
+ +
+ + + diff --git a/celescope/templates/html/dynaseq/replacement_summary.html b/celescope/templates/html/dynaseq/replacement_summary.html new file mode 100644 index 00000000..bcf91dbd --- /dev/null +++ b/celescope/templates/html/dynaseq/replacement_summary.html @@ -0,0 +1,3 @@ +

RNA Turn-over rate

+ +{{ replacement|safe }} diff --git a/celescope/templates/html/dynaseq/subsitution_summary.html b/celescope/templates/html/dynaseq/subsitution_summary.html new file mode 100644 index 00000000..c774021f --- /dev/null +++ b/celescope/templates/html/dynaseq/subsitution_summary.html @@ -0,0 +1,13 @@ + +
+

Dynaseq Analysis

+
+ + + +

Sample substitution rate

+{{ subsitution|safe }} + diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index ae938ad0..306488e2 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -79,6 +79,7 @@ class Multi(): def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = {} + col5_dict = {} with open(mapfile) as fh: for line in fh: line = line.strip() @@ -86,7 +87,7 @@ class Multi(): continue line_split = line.split() library_id, library_path, sample_name = line_split[:3] - if len(line_split) == 4: + if len(line_split) >= 4: col4 = line_split[3] else: col4 = default_val @@ -98,7 +99,8 @@ class Multi(): else: fq_dict[sample_name] = [[fq1], [fq2]] col4_dict[sample_name] = col4 - + if len(line_split) == 5: + col5_dict[sample_name] = line_split[4] for sample_name in fq_dict: fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) @@ -106,7 +108,7 @@ class Multi(): if not fq_dict: raise Exception('empty mapfile!') - return fq_dict, col4_dict + return fq_dict, col4_dict,col5_dict def link_data(self): raw_dir = f'{self.args.outdir}/data_give/rawdata' @@ -122,7 +124,7 @@ class Multi(): parse_mapfile, link data, make log dir, init script variables, init outdir_dic """ # parse_mapfile - self.fq_dict, self.col4_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) + self.fq_dict, self.col4_dict, self.col5_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) # link self.link_data() diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 6baa77b6..f8e73fde 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -437,6 +437,7 @@ def get_fq(library_id, library_path): def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = defaultdict(list) + col5_dict = defaultdict(list) with open(mapfile) as fh: for line in fh: line = line.strip() @@ -448,7 +449,7 @@ def parse_map_col4(mapfile, default_val): library_id = tmp[0] library_path = tmp[1] sample_name = tmp[2] - if len(tmp) == 4: + if len(tmp) >= 4: col4 = tmp[3] else: col4 = default_val @@ -461,6 +462,8 @@ def parse_map_col4(mapfile, default_val): fq_dict[sample_name] = [[fq1], [fq2]] if col4 and col4 != default_val: col4_dict[sample_name] = col4 + if len(tmp) == 5: + col5_dict[sample_name] = tmp[4] for sample_name in fq_dict: fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) @@ -468,7 +471,7 @@ def parse_map_col4(mapfile, default_val): if not fq_dict: raise Exception('empty mapfile!') - return fq_dict, col4_dict + return fq_dict, col4_dict, col5_dict def generate_sjm(cmd, name, conda, m=1, x=1): diff --git a/docs/dynaseq/analysis.md b/docs/dynaseq/analysis.md new file mode 100644 index 00000000..d2a38bc9 --- /dev/null +++ b/docs/dynaseq/analysis.md @@ -0,0 +1,51 @@ +## Features +- Cell clustering with Seurat. + +- Calculate the marker gene of each cluster. + +- Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. + +## Output +- `markers.tsv` Marker genes of each cluster. + +- `tsne_coord.tsv` t-SNE coordinates and clustering information. + +- `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` +parameter is provided. The result contains 3 files: + - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", +it means that the given marker is not enough to identify the cluster. 
+    - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters.
+    - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1)
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--save_rds` Write rds to disk.
+
+`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example:
+```
+cell_type marker
+Alveolar "CLDN18,FOLR1,AQP4,PEBP4"
+Endothelial "CLDN5,FLT1,CDH5,RAMP2"
+Epithelial "CAPS,TMEM190,PIFO,SNTN"
+Fibroblast "COL1A1,DCN,COL1A2,C1R"
+B_cell "CD79A,IGKC,IGLC3,IGHG3"
+Myeloid "LYZ,MARCO,FCGR3A"
+T_cell "CD3D,TRBC1,TRBC2,TRAC"
+LUAD "NKX2-1,NAPSA,EPCAM"
+LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM"
+```
+
+`--matrix_file` Required. Matrix_10X directory from step count.
+
+`--outdir` output dir
+
+`--assay` assay
+
+`--sample` sample name
+
+`--thread` None
+
+`--debug` debug
+
diff --git a/docs/dynaseq/conversion.md b/docs/dynaseq/conversion.md
new file mode 100644
index 00000000..bfb0cb2a
--- /dev/null
+++ b/docs/dynaseq/conversion.md
@@ -0,0 +1,26 @@
+## Features
+- Get conversion positions in each read.
+- Get SNP info.
+
+## Output
+- `{sample}.PosTag.bam` BAM file with conversion info.
+- `{sample}.PosTag.csv` SNP info in csv format.
+
+
+## Arguments
+`--strand` Gene strand file, two columns with no header: gene_id and strand.
+
+`--bam` featureCounts output BAM file.
+
+`--cell` Cell barcode list file, one barcode per line.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
diff --git a/docs/dynaseq/replace_tsne.md b/docs/dynaseq/replace_tsne.md
new file mode 100644
index 00000000..31ed90ce
--- /dev/null
+++ b/docs/dynaseq/replace_tsne.md
@@ -0,0 +1,30 @@
+## Features
+- Plot the turn-over (replacement) rate of each cell on t-SNE coordinates.
+- Report the top replaced genes in each cluster.
+
+## Output
+- `{sample}.rep_in_tsne.txt` Turn-over rate of each cell, with t-SNE coordinates and cluster.
+- `{sample}.rep_in_tsne_top10.txt` Top 10 replaced genes in each cluster.
+
+
+## Arguments
+`--tsne` t-SNE coordinates file (tsne_coord.tsv from the analysis step).
+
+`--mat` Fraction-of-new-RNA matrix file from the replacement step.
+
+`--rep` Fraction-of-new-RNA per cell file from the replacement step.
+
+`--mincell` Only report genes with turn-over detected in at least this many cells. Default 5.
+
+`--topgene` Number of top genes to report per cluster. Default 10.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
diff --git a/docs/dynaseq/replacement.md b/docs/dynaseq/replacement.md
new file mode 100644
index 00000000..1184777c
--- /dev/null
+++ b/docs/dynaseq/replacement.md
@@ -0,0 +1,36 @@
+## Features
+- Computes the replacement rates in each cell and gene.
+- Boxplots of the rate distributions.
+
+## Output
+- `{sample}.TC_matrix.rds` New and old RNA info for each barcode/gene/UMI.
+- `{sample}.new_matrix.tsv.gz` New RNA matrix.
+- `{sample}.old_matrix.tsv.gz` Old RNA matrix.
+- `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell.
+- `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene.
+- `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene.
+
+
+## Arguments
+`--bg_cov` Background SNP depth filter; positions with depth lower than bg_cov will be discarded. Only valid in csv format.
+
+`--bam` BAM file with conversion tags (from the conversion step).
+
+`--bg` Background SNP file, csv or vcf format.
+
+`--cell_keep` Maximum number of cells to keep.
+
+`--min_cell` A gene must be expressed in at least this many cells. Default 10.
+
+`--min_gene` A cell must express at least this many genes. Default 10.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+ +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/star.md b/docs/dynaseq/star.md new file mode 100644 index 00000000..ec3b5211 --- /dev/null +++ b/docs/dynaseq/star.md @@ -0,0 +1,56 @@ +## Features +- Align R2 reads to the reference genome with STAR. +- Collect Metrics with Picard. + +## Output +- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. + +- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. + +- `{sample}_Log.out` Main log with a lot of detailed information about the run. +This is most useful for troubleshooting and debugging. + +- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, +% of mapped reads etc. It is updated in 1 minute intervals. + +- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, +very useful for quality control. The statistics are calculated for each read (single- or paired-end) and +then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, +(unlike the samtools agstat/idxstats, which count each mate separately). +Most of the information is collected about the UNIQUE mappers +(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). +Each splicing is counted in the numbers of splices, which would correspond to +summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, +i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. + +- `{sample}_region.log` Picard CollectRnaSeqMetrics results. + + +## Arguments +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fq` Required. R2 fastq file. + +`--consensus_fq` Input fastq has been consensused + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/subsitution.md b/docs/dynaseq/subsitution.md new file mode 100644 index 00000000..e2b7b169 --- /dev/null +++ b/docs/dynaseq/subsitution.md @@ -0,0 +1,20 @@ +## Features +- Computes the overall conversion rates in reads and plots a barplot. + +## Output +- `{sample}.substitution.txt` Tab-separated table of the overall conversion rates. + + +## Arguments +`--bam` bam file + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. 
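+
+## Example
+A minimal sketch of reading the substitution table (path is illustrative; the three
+columns are an assumption based on `sub_stat` in `subsitution.py`: conversion type,
+forward-strand rate and reverse-strand rate in percent):
+```
+import pandas as pd
+
+rates = pd.read_table("sample.substitution.txt", header=None,
+                      names=["conversion", "forward_pct", "reverse_pct"])
+# T_to_C on the forward strand is the metabolic labeling signal in dynaseq data
+print(rates[rates["conversion"] == "T_to_C"])
+```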
+ -- Gitee From 91de4f07f43dc6954d6fdafd5651a2fca92bb9c8 Mon Sep 17 00:00:00 2001 From: Tony Zhou Date: Thu, 24 Jun 2021 09:45:37 +0800 Subject: [PATCH 75/96] fix pysam.VariantFile --- celescope/dynaseq/replacement.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/celescope/dynaseq/replacement.py b/celescope/dynaseq/replacement.py index c8ac2a15..8c6a8119 100755 --- a/celescope/dynaseq/replacement.py +++ b/celescope/dynaseq/replacement.py @@ -138,8 +138,7 @@ class Replacement(Step): chr_pos = ii[1]+'_'+ii[5] outdict[chr_pos] = 1 elif bgfile.endswith('.vcf'): - from pysam import VariantFile - bcf_in = VariantFile(bgfile) + bcf_in = pysam.VariantFile(bgfile) for rec in bcf_in.fetch(): try: chrom, pos = rec.chrom, rec.pos -- Gitee From 89838794667a08bad5bcc2797d853afa151ae4ec Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 10:16:16 +0800 Subject: [PATCH 76/96] change tests folder position --- docs/CONTRIBUTING.md | 4 ++-- setup.sh | 7 ------- {celescope/tests => tests}/__init__.py | 0 {celescope/tests => tests}/conftest.py | 0 {celescope/tests => tests}/test_function.py | 0 {celescope/tests => tests}/test_multi.py | 0 6 files changed, 2 insertions(+), 9 deletions(-) delete mode 100755 setup.sh rename {celescope/tests => tests}/__init__.py (100%) rename {celescope/tests => tests}/conftest.py (100%) rename {celescope/tests => tests}/test_function.py (100%) rename {celescope/tests => tests}/test_multi.py (100%) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 782c33ec..fb9ee986 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -98,9 +98,9 @@ git clone https://github.com/singleron-RD/celescope_tests.git Install pytest >>> pip install pytest Run all ->>> pytest -s celescope/tests/test_multi.py --test_dir {test_dir} +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} Run some tests ->>> pytest -s celescope/tests/test_multi.py --test_dir {test_dir} --assays rna,tag +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag ``` Then you need to create your own test based on this example. 
\ No newline at end of file diff --git a/setup.sh b/setup.sh deleted file mode 100755 index 082ee59b..00000000 --- a/setup.sh +++ /dev/null @@ -1,7 +0,0 @@ -git clone https://github.com/singleron-RD/CeleScope.git - -conda create -n celescope -conda activate celescope -conda install --file conda_pkgs.txt --channel conda-forge --channel bioconda --channel r --channel imperial-college-research-computing - -pip install celescope diff --git a/celescope/tests/__init__.py b/tests/__init__.py similarity index 100% rename from celescope/tests/__init__.py rename to tests/__init__.py diff --git a/celescope/tests/conftest.py b/tests/conftest.py similarity index 100% rename from celescope/tests/conftest.py rename to tests/conftest.py diff --git a/celescope/tests/test_function.py b/tests/test_function.py similarity index 100% rename from celescope/tests/test_function.py rename to tests/test_function.py diff --git a/celescope/tests/test_multi.py b/tests/test_multi.py similarity index 100% rename from celescope/tests/test_multi.py rename to tests/test_multi.py -- Gitee From 41972ec591a897d4cba5e47d8b71ebcc7af83624 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 10:44:05 +0800 Subject: [PATCH 77/96] remove duplicated star and analysis from dynaseq --- celescope/dynaseq/__init__.py | 5 ++ celescope/dynaseq/analysis.py | 104 ---------------------- celescope/dynaseq/star.py | 163 ---------------------------------- tests/test_multi.py | 1 + 4 files changed, 6 insertions(+), 267 deletions(-) delete mode 100755 celescope/dynaseq/analysis.py delete mode 100755 celescope/dynaseq/star.py diff --git a/celescope/dynaseq/__init__.py b/celescope/dynaseq/__init__.py index 6ed6df1f..ae7742a4 100755 --- a/celescope/dynaseq/__init__.py +++ b/celescope/dynaseq/__init__.py @@ -13,6 +13,11 @@ __STEPS__ = [ __ASSAY__ = 'dynaseq' +IMPORT_DICT = { + 'star': 'celescope.rna', + 'analysis': 'celescope.rna', +} + # m: memory # x: thread RESOURCE = { diff --git a/celescope/dynaseq/analysis.py b/celescope/dynaseq/analysis.py deleted file mode 100755 index ad5ec8f8..00000000 --- a/celescope/dynaseq/analysis.py +++ /dev/null @@ -1,104 +0,0 @@ -import pandas as pd - -from celescope.tools.analysis_mixin import AnalysisMixin -from celescope.tools.step import Step -from celescope.tools.utils import add_log, get_id_name_dict, s_common - - -@add_log -def generate_matrix(gtf_file, matrix_file): - - id_name = get_id_name_dict(gtf_file) - matrix = pd.read_csv(matrix_file, sep="\t") - - gene_name_col = matrix.geneID.apply(lambda x: id_name[x]) - matrix.geneID = gene_name_col - matrix = matrix.drop_duplicates(subset=["geneID"], keep="first") - matrix = matrix.dropna() - matrix = matrix.rename({"geneID": ""}, axis='columns') - return matrix - - -class Analysis_dynaseq(Step, AnalysisMixin): - """ - Features - - Cell clustering with Seurat. - - - Calculate the marker gene of each cluster. - - - Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. - - Output - - `markers.tsv` Marker genes of each cluster. - - - `tsne_coord.tsv` t-SNE coordinates and clustering information. - - - `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` - parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", - it means that the given marker is not enough to identify the cluster. 
- - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) - """ - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - AnalysisMixin.__init__(self, args) - self.matrix_file = args.matrix_file - self.genomeDir = args.genomeDir - self.type_marker_tsv = args.type_marker_tsv - self.auto_assign_bool = False - self.save_rds = args.save_rds - if args.type_marker_tsv and args.type_marker_tsv != 'None': - self.auto_assign_bool = True - self.save_rds = True - - def run(self): - self.seurat(self.matrix_file, self.save_rds, self.genomeDir) - if self.auto_assign_bool: - self.auto_assign(self.type_marker_tsv) - self.run_analysis() - self.add_data_item(cluster_tsne=self.cluster_tsne) - self.add_data_item(gene_tsne=self.gene_tsne) - self.add_data_item(table_dict=self.table_dict) - - self.clean_up() - - -@add_log -def analysis(args): - - step_name = "analysis" - ana = Analysis_dynaseq(args, step_name) - ana.run() - - -def get_opts_analysis(parser, sub_program): - - parser.add_argument('--genomeDir', help='Required. Genome directory.', required=True) - parser.add_argument('--save_rds', action='store_true', help='Write rds to disk.') - parser.add_argument( - '--type_marker_tsv', - help="""A tsv file with header. If this parameter is provided, cell type will be annotated. Example: -``` -cell_type marker -Alveolar "CLDN18,FOLR1,AQP4,PEBP4" -Endothelial "CLDN5,FLT1,CDH5,RAMP2" -Epithelial "CAPS,TMEM190,PIFO,SNTN" -Fibroblast "COL1A1,DCN,COL1A2,C1R" -B_cell "CD79A,IGKC,IGLC3,IGHG3" -Myeloid "LYZ,MARCO,FCGR3A" -T_cell "CD3D,TRBC1,TRBC2,TRAC" -LUAD "NKX2-1,NAPSA,EPCAM" -LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" -```""" - ) - if sub_program: - parser.add_argument( - '--matrix_file', - help='Required. Matrix_10X directory from step count.', - required=True, - ) - parser = s_common(parser) - - - diff --git a/celescope/dynaseq/star.py b/celescope/dynaseq/star.py deleted file mode 100755 index 525cfcf6..00000000 --- a/celescope/dynaseq/star.py +++ /dev/null @@ -1,163 +0,0 @@ -import subprocess - -import pandas as pd - -import celescope.tools.utils as utils -from celescope.__init__ import ROOT_PATH -from celescope.tools.star_mixin import StarMixin, get_opts_star_mixin -from celescope.tools.step import Step - - -class Star_dynaseq(Step, StarMixin): - """ - Features - - Align R2 reads to the reference genome with STAR. - - Collect Metrics with Picard. - - Output - - `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - - - `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - - - `{sample}_Log.out` Main log with a lot of detailed information about the run. - This is most useful for troubleshooting and debugging. - - - `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, - % of mapped reads etc. It is updated in 1 minute intervals. - - - `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, - very useful for quality control. The statistics are calculated for each read (single- or paired-end) and - then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, - (unlike the samtools agstat/idxstats, which count each mate separately). 
- Most of the information is collected about the UNIQUE mappers - (unlike samtools agstat/idxstats which does not separate unique or multi-mappers). - Each splicing is counted in the numbers of splices, which would correspond to - summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, - i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - - - `{sample}_region.log` Picard CollectRnaSeqMetrics results. - """ - - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - StarMixin.__init__(self, args) - # parse - self.refflat = f"{self.genomeDir}/{self.genome['refflat']}" - - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' - self.picard_region_log = f'{self.outdir}/{self.sample}_region.log' - self.plot = None - self.stats = pd.Series() - - def add_other_metrics(self): - """ - add picard region bases - add region plot - if debug, add ribosomal RNA reads percent - """ - - with open(self.picard_region_log, 'r') as picard_log: - region_dict = {} - for line in picard_log: - if not line: - break - if line.startswith('## METRICS CLASS'): - header = picard_log.readline().strip().split('\t') - data = picard_log.readline().strip().split('\t') - region_dict = dict(zip(header, data)) - break - - total = float(region_dict['PF_ALIGNED_BASES']) - exonic_regions = int(region_dict['UTR_BASES']) + \ - int(region_dict['CODING_BASES']) - intronic_regions = int(region_dict['INTRONIC_BASES']) - intergenic_regions = int(region_dict['INTERGENIC_BASES']) - - self.add_metric( - name='Base Pairs Mapped to Exonic Regions', - value=exonic_regions, - total=total, - ) - self.add_metric( - name='Base Pairs Mapped to Intronic Regions', - value=intronic_regions, - total=total, - ) - self.add_metric( - name='Base Pairs Mapped to Intergenic Regions', - value=intergenic_regions, - total=total, - ) - - # ribo - if self.debug: - with open(self.ribo_log, 'r') as ribo_log: - for line in ribo_log: - if line.find('#Matched') != -1: - items = line.split() - Reads_Mapped_to_rRNA = int(items[1]) - if line.find('#Total') != -1: - items = line.split() - Reads_Total = int(items[1]) - self.add_metric( - name=f'{self.stat_prefix} Mapped to rRNA', - value=Reads_Mapped_to_rRNA, - total=Reads_Total, - ) - - region_plot = {'region_labels': ['Exonic Regions', 'Intronic Regions', 'Intergenic Regions'], - 'region_values': [exonic_regions, intronic_regions, intergenic_regions]} - self.add_content_item("data", STAR_plot=region_plot) - - - @utils.add_log - def ribo(self): - human_ribo_fa = f'{ROOT_PATH}/data/rRNA/human_ribo.fasta' - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' - cmd = ( - f'bbduk.sh ' - f'in1={self.fq} ' - f'ref={human_ribo_fa} ' - f'stats={self.ribo_log} ' - f'overwrite=t ' - f'> {self.ribo_run_log} 2>&1 ' - ) - Star_dynaseq.ribo.logger.info(cmd) - subprocess.check_call(cmd, shell=True) - - @utils.add_log - def picard(self): - cmd = [ - 'picard', - '-Xmx20G', - '-XX:ParallelGCThreads=4', - 'CollectRnaSeqMetrics', - 'I=%s' % (self.STAR_bam), - 'O=%s' % (self.picard_region_log), - 'REF_FLAT=%s' % (self.refflat), - 'STRAND=NONE', - 'VALIDATION_STRINGENCY=SILENT'] - cmd_str = ' '.join(cmd) - Star_dynaseq.picard.logger.info(cmd_str) - subprocess.check_call(cmd) - - @utils.add_log - def run(self): - self.run_star() - self.picard() - if self.debug: - self.ribo() - 
self.add_other_metrics() - self.clean_up() - - -def star(args): - step_name = "star" - runner = Star_dynaseq(args, step_name) - runner.run() - - -def get_opts_star(parser, sub_program): - get_opts_star_mixin(parser, sub_program) diff --git a/tests/test_multi.py b/tests/test_multi.py index fe853db7..b1004f16 100755 --- a/tests/test_multi.py +++ b/tests/test_multi.py @@ -15,6 +15,7 @@ ASSAYS = [ 'capture_virus', 'snp', 'rna', + 'dynaseq', ] -- Gitee From f1b13b59da28ceda817cbfb4c92939828d9c947a Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 10:50:02 +0800 Subject: [PATCH 78/96] analysis fix --- celescope/rna/analysis.py | 10 +++---- docs/dynaseq/analysis.md | 51 ----------------------------------- docs/dynaseq/star.md | 56 --------------------------------------- docs/rna/analysis.md | 10 +++---- 4 files changed, 10 insertions(+), 117 deletions(-) delete mode 100644 docs/dynaseq/analysis.md delete mode 100644 docs/dynaseq/star.md diff --git a/celescope/rna/analysis.py b/celescope/rna/analysis.py index 8045e329..50cc4e58 100755 --- a/celescope/rna/analysis.py +++ b/celescope/rna/analysis.py @@ -1,14 +1,14 @@ import pandas as pd from celescope.tools.analysis_mixin import AnalysisMixin -from celescope.tools.step import Step -from celescope.tools.utils import add_log, get_id_name_dict, s_common +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils -@add_log +@utils.add_log def generate_matrix(gtf_file, matrix_file): - id_name = get_id_name_dict(gtf_file) + id_name = utils.get_id_name_dict(gtf_file) matrix = pd.read_csv(matrix_file, sep="\t") gene_name_col = matrix.geneID.apply(lambda x: id_name[x]) @@ -65,7 +65,7 @@ class Analysis_rna(Step, AnalysisMixin): self.clean_up() -@add_log +@utils.add_log def analysis(args): step_name = "analysis" diff --git a/docs/dynaseq/analysis.md b/docs/dynaseq/analysis.md deleted file mode 100644 index d2a38bc9..00000000 --- a/docs/dynaseq/analysis.md +++ /dev/null @@ -1,51 +0,0 @@ -## Features -- Cell clustering with Seurat. - -- Calculate the marker gene of each cluster. - -- Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. - -## Output -- `markers.tsv` Marker genes of each cluster. - -- `tsne_coord.tsv` t-SNE coordinates and clustering information. - -- `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` -parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", -it means that the given marker is not enough to identify the cluster. - - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--save_rds` Write rds to disk. - -`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: -``` -cell_type marker -Alveolar "CLDN18,FOLR1,AQP4,PEBP4" -Endothelial "CLDN5,FLT1,CDH5,RAMP2" -Epithelial "CAPS,TMEM190,PIFO,SNTN" -Fibroblast "COL1A1,DCN,COL1A2,C1R" -B_cell "CD79A,IGKC,IGLC3,IGHG3" -Myeloid "LYZ,MARCO,FCGR3A" -T_cell "CD3D,TRBC1,TRBC2,TRAC" -LUAD "NKX2-1,NAPSA,EPCAM" -LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" -``` - -`--matrix_file` Required. Matrix_10X directory from step count. 
- -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - diff --git a/docs/dynaseq/star.md b/docs/dynaseq/star.md deleted file mode 100644 index ec3b5211..00000000 --- a/docs/dynaseq/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). -Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna/analysis.md b/docs/rna/analysis.md index d2a38bc9..9ddfd1b3 100644 --- a/docs/rna/analysis.md +++ b/docs/rna/analysis.md @@ -39,13 +39,13 @@ LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" `--matrix_file` Required. Matrix_10X directory from step count. -`--outdir` output dir +`--outdir` Output diretory. -`--assay` assay +`--assay` Assay name. -`--sample` sample name +`--sample` Sample name. -`--thread` None +`--thread` Thread to use. -`--debug` debug +`--debug` If this argument is used, celescope may output addtional file for debugging. 
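
For context, `generate_matrix` in celescope/rna/analysis.py swaps Ensembl gene IDs
for gene names before clustering. A minimal sketch of that swap, with a hypothetical
two-gene mapping standing in for `utils.get_id_name_dict`:
```
import pandas as pd

# Hypothetical mapping; the real dict is parsed from the GTF by utils.get_id_name_dict.
id_name = {"ENSG00000141510": "TP53", "ENSG00000012048": "BRCA1"}

matrix = pd.DataFrame({"geneID": ["ENSG00000141510", "ENSG00000012048"],
                       "cell_1": [3, 0], "cell_2": [1, 2]})
matrix.geneID = matrix.geneID.apply(lambda x: id_name[x])
matrix = matrix.drop_duplicates(subset=["geneID"], keep="first").dropna()
matrix = matrix.rename({"geneID": ""}, axis="columns")
print(matrix)
```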
-- Gitee From f778ab0483c0629f18e5df20991f1044f296d31c Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 24 Jun 2021 11:37:19 +0800 Subject: [PATCH 79/96] add assembly length ang clean fq --- celescope/trust_vdj/res_filter.py | 29 +++++++++++--- celescope/trust_vdj/trust_assemble.py | 58 ++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index 883ae849..ed7a2720 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -4,10 +4,24 @@ from celescope.tools import utils from collections import defaultdict from celescope.tools.cellranger3 import get_plot_elements import numpy as np +import pysam + + +def get_len(fa): + with pysam.FastaFile(fa) as fh: + res = {} + names = fh.references + lengths = fh.lengths + res['contig_id'] = names + res['length'] = lengths + + df = pd.DataFrame(res, columns=list(res.keys())) + return df @utils.add_log -def beauty_report(barcode_report): +def beauty_report(barcode_report, fa): + df_len = get_len(fa) df = pd.read_csv(barcode_report, sep='\t') rows = df.shape[0] chains = ['chain2', 'chain1'] @@ -16,7 +30,7 @@ def beauty_report(barcode_report): for l in range(len(chains)): chain = chains[l] - items = {'V': 0, 'D': 1, 'J': 2, 'C': 3, 'CDR3nt': 4, 'CDR3aa': 5, 'readcount': 6, 'full_length_assembly': -1} + items = {'V': 0, 'D': 1, 'J': 2, 'C': 3, 'CDR3nt': 4, 'CDR3aa': 5, 'readcount': 6, 'contig_id': -3, 'full_length_assembly': -1} for i in range(rows): cb = df.loc[i, '#barcode'] @@ -33,9 +47,11 @@ def beauty_report(barcode_report): res = pd.DataFrame(dic, columns=list(dic.keys())) - return res + df_res = pd.merge(res, df_len, on='contig_id', how='inner') + return df_res +@utils.add_log def get_clone_table(df, Seqtype): res_filter_summary = [] @@ -46,7 +62,7 @@ def get_clone_table(df, Seqtype): paired_groups = ['TRA_TRB'] if Seqtype == 'BCR': chains = ['IGH', 'IGL', 'IGK'] - paired_groups = ['IGH_IHL', 'IGH_IGK'] + paired_groups = ['IGH_IGL', 'IGH_IGK'] for chain in chains: tmp = df[df['V'].str.contains(chain, na=False)] tmp = tmp.set_index('barcode') @@ -126,11 +142,12 @@ class Res_filter(Step): @utils.add_log def run(self): barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' - df = beauty_report(barcode_report) + fa = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_annot.fa' + df = beauty_report(barcode_report, fa) if self.full_length: df = df[df['full_length_assembly']=='1'] - df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t') + df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t', index=False) clones, res_filter_summary = get_clone_table(df, self.Seqtype) diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py index e052d0ae..8cf8f76a 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/trust_assemble.py @@ -47,6 +47,21 @@ def match_barcodes(outdir, match_dir, Seqtype, fq1): seqlist.write(str(name) + '\n') +def clean_fq(fq1, fq2, outdir, sample, species): + + prefix = f'{outdir}/{sample}_clean' + + cmd = ( + f'/SGRNJ03/randd/zhouxin/software/TRUST4/fastq-extractor ' + f'-t 10 -f /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa ' + f'-o {prefix} --barcodeStart 0 --barcodeEnd 23 ' + f'-u {fq2} ' + f'--barcode {fq1}' + ) + + os.system(cmd) + + def mapping_summary(outdir, Seqtype, fq, species): stat_file = outdir + '/stat.txt' @@ -93,15 +108,32 @@ def 
mapping_summary(outdir, Seqtype, fq, species): 'count': count, 'total_count': total_count, }) + # os.system(f'rm {outdir}/{locus}.sam') - os.system(f'rm {outdir}/{locus}.sam') - - trust_assemble_summary.insert(0, { - 'item': stat_string, - 'count': total_mapped, - 'total_count': total_count - }) - + # total mapping + cmd = ( + f'source activate bracer; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{Seqtype} ' + f'-U {fq} ' + f'-S {outdir}/{Seqtype}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + count = int(res[0]) + trust_assemble_summary.insert(0, { + 'item': stat_string, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/*.sam') os.system(f'rm {outdir}/log') df = pd.DataFrame(trust_assemble_summary, columns=['item', 'count', 'total_count']) @@ -147,7 +179,8 @@ class Trust_assemble(Step): @utils.add_log def run(self): - self.getFqfile() + if not os.path.exists(f'{self.outdir}/{self.sample}_matched_R2.fq'): + self.getFqfile() species = self.species @@ -175,7 +208,12 @@ class Trust_assemble(Step): #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' - mapping_summary(self.outdir, self.Seqtype, self.fq2, species) + # report + clean_fq(self.fq1, self.fq2, self.outdir, self.sample, species) + + fq = f'{self.outdir}/{self.sample}_clean.fq' + + mapping_summary(self.outdir, self.Seqtype, fq, species) os.remove(f'{self.outdir}/seqlist.txt') -- Gitee From 7444c8e3edf6d4f6fd050713e6de555f51d8156b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 24 Jun 2021 11:39:26 +0800 Subject: [PATCH 80/96] add bowtie2 and plotly==4.14.3 --- conda_pkgs.txt | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conda_pkgs.txt b/conda_pkgs.txt index 22f5fab0..99567356 100755 --- a/conda_pkgs.txt +++ b/conda_pkgs.txt @@ -11,4 +11,5 @@ r-tidyverse mixcr=3.0.3 bioconductor-dropletutils bcftools==1.9 -seqkt \ No newline at end of file +seqkt +bowtie2 \ No newline at end of file diff --git a/setup.py b/setup.py index f6256a90..a23429ce 100755 --- a/setup.py +++ b/setup.py @@ -41,6 +41,6 @@ setuptools.setup( 'editdistance>=0.5.3', 'mutract', 'sklearn', - 'plotly', + 'plotly==4.14.3', ] ) -- Gitee From a2b5d20e810b55a864f29542b388e4e33ca2b034 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 16:40:58 +0800 Subject: [PATCH 81/96] add scripts --- celescope/scripts/gene_umi_summary.R | 28 +++++++ celescope/scripts/mt_summary.py | 106 +++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 celescope/scripts/gene_umi_summary.R create mode 100644 celescope/scripts/mt_summary.py diff --git a/celescope/scripts/gene_umi_summary.R b/celescope/scripts/gene_umi_summary.R new file mode 100644 index 00000000..60e93751 --- /dev/null +++ b/celescope/scripts/gene_umi_summary.R @@ -0,0 +1,28 @@ +library(Seurat) +library(tidyverse) +library(argparser) + +argv <- arg_parser('') +argv <- add_argument(argv,"--matrix_dir", help="") +argv <- add_argument(argv,"--outdir", help="") +argv <- add_argument(argv,"--sample", help="") +argv <- add_argument(argv,"--mt_gene_list_file", help="") +argv <- parse_args(argv) + +matrix_dir = argv$matrix_dir +outdir = argv$outdir +sample = argv$sample +mt_gene_list_file = argv$mt_gene_list_file + +# out +df.out = 
str_glue("{outdir}/{sample}_MT_UMI.tsv")
+
+mtx = Read10X(matrix_dir)
+mt_gene_list = read.table(mt_gene_list_file)[,1]
+
+gene_valid = rownames(mtx)
+gene_intersect = intersect(gene_valid, mt_gene_list)
+cells = dim(mtx)[2]
+mean_UMI = sort(round(rowSums(mtx[gene_intersect,]) / cells,3), decreasing = T)
+df = as.data.frame(mean_UMI)
+write.table(df, df.out, sep='\t', col.names=NA)
diff --git a/celescope/scripts/mt_summary.py b/celescope/scripts/mt_summary.py
new file mode 100644
index 00000000..20019d02
--- /dev/null
+++ b/celescope/scripts/mt_summary.py
@@ -0,0 +1,106 @@
+import argparse
+import glob
+import os
+import subprocess
+
+import pandas as pd
+from plotnine import ggplot, aes, geom_line
+
+from celescope.celescope import ArgFormatter
+from celescope.__init__ import HELP_DICT, ROOT_PATH
+from celescope.rna.mkref import parse_genomeDir_rna
+import celescope.tools.utils as utils
+
+SAMPLE_COL_INDEX = 2
+
+def parse_mapfile(mapfile):
+    sample_set = set()
+    df_mapfile = pd.read_csv(mapfile, sep='\t', header=None)
+
+    def read_row(row):
+        sample = row[SAMPLE_COL_INDEX]
+        sample_set.add(sample)
+
+    df_mapfile.apply(read_row, axis=1)
+    return sample_set
+
+
+class Mt_summary():
+    def __init__(self, sample, outdir, genomeDir):
+        self.sample = sample
+        self.outdir = outdir
+
+        # set
+        self.mt_gene_list_file = parse_genomeDir_rna(genomeDir)['mt_gene_list']
+        self.featureCounts_bam = None
+        try:
+            self.featureCounts_bam = glob.glob(f'{sample}/*featureCounts/{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam')[0]
+        except IndexError:
+            print("featureCounts bam does not exist! Skip coverage summary.")
+
+        self.matrix_dir = glob.glob(f'{sample}/*count/{sample}_matrix_10X')[0]
+
+        # out
+        if not os.path.exists(outdir):
+            os.system(f'mkdir -p {outdir}')
+        out_prefix = f'{outdir}/{sample}'
+        self.mt_bam = f'{out_prefix}_mt.bam'
+        self.mt_depth = f'{out_prefix}_mt_depth.tsv'
+        self.coverage_plot = f'{out_prefix}_mt_coverage.png'
+
+    @utils.add_log
+    def samtools(self):
+        cmd = (
+            f'samtools index {self.featureCounts_bam};'
+            f'samtools view -b {self.featureCounts_bam} MT -o {self.mt_bam};'
+            f'samtools depth -a {self.mt_bam} > {self.mt_depth}'
+        )
+        self.samtools.logger.info(cmd)
+        subprocess.check_call(cmd, shell=True)
+
+    @utils.add_log
+    def umi_summary(self):
+        cmd = (
+            f'Rscript {ROOT_PATH}/scripts/gene_umi_summary.R '
+            f'--sample {self.sample} '
+            f'--outdir {self.outdir} '
+            f'--mt_gene_list_file {self.mt_gene_list_file} '
+            f'--matrix_dir {self.matrix_dir} '
+        )
+        self.umi_summary.logger.info(cmd)
+        subprocess.check_call(cmd, shell=True)
+
+    @utils.add_log
+    def coverage_summary(self):
+        self.samtools()
+        df = pd.read_csv(self.mt_depth, sep='\t', header=None)
+        df.columns = ["MT", "position", "read_count"]
+        plot = ggplot(df, aes(x="position",y="read_count")) + geom_line()
+        plot.save(self.coverage_plot)
+
+    @utils.add_log
+    def run(self):
+        if self.featureCounts_bam:
+            self.umi_summary()
+            self.coverage_summary()
+
+
+def main():
+    parser = argparse.ArgumentParser(description='mt summary', formatter_class=ArgFormatter)
+    parser.add_argument("--mapfile", help="mapfile with sample names in the 3rd column", required=True)
+    parser.add_argument("--genomeDir", help=HELP_DICT["genomeDir"],
+        default='/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92')
+    parser.add_argument("--outdir", help="output dir", default='mt_summary')
+    args = parser.parse_args()
+
+    sample_set = parse_mapfile(args.mapfile)
+    for sample in sample_set:
+        runner = Mt_summary(
+            sample=sample,
+            outdir=args.outdir,
+            
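# genomeDir must be a `celescope rna mkref` output: Mt_summary.__init__ reads its `mt_gene_list` entry via parse_genomeDir_rna
+            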
genomeDir=args.genomeDir, + ) + runner.run() + +if __name__ == '__main__': + main() \ No newline at end of file -- Gitee From 10fe5f9e3e500704e7c1241c30ff4535f6c42709 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 17:57:49 +0800 Subject: [PATCH 82/96] mean read per cell --- celescope/scripts/gene_umi_summary.R | 4 ++-- celescope/scripts/mt_summary.py | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/celescope/scripts/gene_umi_summary.R b/celescope/scripts/gene_umi_summary.R index 60e93751..d6f2a33c 100644 --- a/celescope/scripts/gene_umi_summary.R +++ b/celescope/scripts/gene_umi_summary.R @@ -15,7 +15,7 @@ sample = argv$sample mt_gene_list_file = argv$mt_gene_list_file # out -df.out = str_glue("{outdir}/{sample}_MT_UMI.tsv") +df.out = str_glue("{outdir}/{sample}_mt_UMI.tsv") mtx = Read10X(matrix_dir) mt_gene_list = read.table(mt_gene_list_file)[,1] @@ -25,4 +25,4 @@ gene_intersect = intersect(gene_valid, mt_gene_list) cells = dim(mtx)[2] mean_UMI = sort(round(rowSums(mtx[gene_intersect,]) / cells,3), decreasing = T) df = as.data.frame(mean_UMI) -write.table(df, df.out, sep='\t', col.names=NA) +write.table(df, df.out, sep='\t', col.names=NA, quote = F) diff --git a/celescope/scripts/mt_summary.py b/celescope/scripts/mt_summary.py index 20019d02..fd5b8e67 100644 --- a/celescope/scripts/mt_summary.py +++ b/celescope/scripts/mt_summary.py @@ -26,19 +26,22 @@ def parse_mapfile(mapfile): class Mt_summary(): - def __init__(self, sample, outdir, genomeDir): + def __init__(self, sample, outdir, genomeDir, root_dir): self.sample = sample self.outdir = outdir # set + match_dir = f'{root_dir}/{sample}' self.mt_gene_list_file = parse_genomeDir_rna(genomeDir)['mt_gene_list'] - self.featureCounts_bam = None + _barcodes, self.ncell = utils.read_barcode_file(match_dir) + self.bam = None try: - self.featureCounts_bam = glob.glob(f'{sample}/*featureCounts/{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam')[0] + self.bam = glob.glob( + f'{match_dir}/03*/{sample}*sortedByCoord.out.bam')[0] except IndexError: - print("featureCounts bam does not exist! Skip coverage summary.") + print("STAR bam does not exist! 
Skip coverage summary.") - self.matrix_dir = glob.glob(f'{sample}/*count/{sample}_matrix_10X')[0] + self.matrix_dir = glob.glob(f'{match_dir}/*count/{sample}_matrix_10X')[0] # out if not os.path.exists(outdir): @@ -51,8 +54,7 @@ class Mt_summary(): @utils.add_log def samtools(self): cmd = ( - f'samtools index {self.featureCounts_bam};' - f'samtools view -b {self.featureCounts_bam} MT -o {self.mt_bam};' + f'samtools view -b {self.bam} MT -o {self.mt_bam};' f'samtools depth -a {self.mt_bam} > {self.mt_depth}' ) self.samtools.logger.info(cmd) @@ -75,14 +77,15 @@ class Mt_summary(): self.samtools() df = pd.read_csv(self.mt_depth, sep='\t', header=None) df.columns = ["MT", "position", "read_count"] - plot = ggplot(df, aes(x="position",y="read_count")) + geom_line() + df["mean_read_count_per_cell"] = df["read_count"].apply(lambda x: x / self.ncell) + plot = ggplot(df, aes(x="position", y="mean_read_count_per_cell")) + geom_line() plot.save(self.coverage_plot) @utils.add_log def run(self): - if self.featureCounts_bam: - self.umi_summary() - self.coverage_summary() + self.umi_summary() + if self.bam: + self.coverage_summary() def main(): @@ -90,6 +93,7 @@ def main(): parser.add_argument("--mapfile", help="mapfile with VIDs as 5th column", required=True) parser.add_argument("--genomeDir", help=HELP_DICT["genomeDir"], default='/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92') + parser.add_argument("--root_dir", help='input root_dir', default='./') parser.add_argument("--outdir", help="output dir", default='mt_summary') args = parser.parse_args() @@ -99,6 +103,7 @@ def main(): sample=sample, outdir=args.outdir, genomeDir=args.genomeDir, + root_dir=args.root_dir, ) runner.run() -- Gitee From 1e3feab042f2a6948de7f52841c9ab38ccfc3b26 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 09:57:59 +0800 Subject: [PATCH 83/96] update --- celescope/__init__.py | 2 ++ celescope/snp/mkref.py | 19 +++++++++++++++--- celescope/tools/multi.py | 43 +++++++++++++++++++++------------------- celescope/tools/step.py | 6 +++--- docs/snp/mkref.md | 8 ++++++++ generate_docs.py | 4 ++-- 6 files changed, 54 insertions(+), 28 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index d71b3352..e89d5496 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -27,4 +27,6 @@ HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', 'gene_list': 'Gene list file, one gene symbol per line. Only results of these genes are reported.', 'genomeDir': 'Genome directory after running `mkref`.', + 'thread': 'Thread to use.', + 'debug': 'If this argument is used, celescope may output addtional file for debugging.', } diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py index b5632a9f..32fd2dd0 100644 --- a/celescope/snp/mkref.py +++ b/celescope/snp/mkref.py @@ -9,9 +9,22 @@ from celescope.tools.mkref import get_opts_mkref as opts class Mkref_snp(Mkref): """ - https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format - Create dictionary file and fasta index for gatk SplitNCigarReads. - Need to build on top of a rna genome. + Features + - Create dictionary file and fasta index for gatk SplitNCigarReads. 
+ (https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) + Need to run `celescope rna mkref` first + + Output + - fasta index + - gatk dictionary file + + Usage + ``` + # run celescope rna mkref first + celescope snp mkref \ + --genome_name Homo_sapiens_ensembl_99 \ + --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa + ``` """ def __init__(self, genome_type, args): diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 8b3351ff..4df0d7e8 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -7,6 +7,7 @@ from collections import defaultdict import celescope import celescope.tools.utils as utils from celescope.celescope import ArgFormatter +from celescope.__init__ import HELP_DICT TOOLS_DIR = os.path.dirname(celescope.tools.__file__) @@ -21,7 +22,6 @@ class Multi(): self.__APP__ = 'celescope' self.col4_default = None self.last_step = '' - self.args = None self.steps_not_run = ['mkref'] # remove @@ -42,7 +42,20 @@ class Multi(): elif self.args.steps_run: self.steps_run = self.args.steps_run.strip().split(',') - self.prepare() + # init + self.fq_dict = {} + self.col4_dict = {} + self.col5_dict = {} + self.logdir = self.args.outdir + '/log' + + # script init + self.sjm_cmd = f'log_dir {self.logdir}\n' + self.sjm_order = '' + self.shell_dict = defaultdict(str) + + # outdir dict + self.outdir_dic = {} + def common_args(self): readme = f'{self.__ASSAY__} multi-samples' @@ -57,15 +70,15 @@ class Multi(): 1st col: LibName; 2nd col: DataDir; 3rd col: SampleName; - 4th col: Cell number or match_dir, optional; + 4th col: optional; ''', required=True) parser.add_argument('--rm_files', action='store_true', help='remove redundant fq.gz and bam after running') - parser.add_argument('--steps_run', help='steps to run', default='all') + parser.add_argument('--steps_run', help='Steps to run. 
Multiple Steps are separated by comma.', default='all') # sub_program parser do not have - parser.add_argument('--outdir', help='output dir', default="./") - parser.add_argument('--debug', help='debug or not', action='store_true') - parser.add_argument('--thread', help='thread', default=4) + parser.add_argument('--outdir', help='Output directory.', default="./") + parser.add_argument('--thread', help=HELP_DICT['thread'], default=4) + parser.add_argument('--debug', help=HELP_DICT['debug'], action='store_true') self.parser = parser return parser @@ -122,26 +135,15 @@ class Multi(): def prepare(self): """ - parse_mapfile, link data, make log dir, init script variables, init outdir_dic + parse_mapfile, make log dir, init script variables, init outdir_dic """ # parse_mapfile self.fq_dict, self.col4_dict, self.col5_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) - # link - self.link_data() - # mk log dir - self.logdir = self.args.outdir + '/log' if self.args.mod == 'sjm': os.system('mkdir -p %s' % (self.logdir)) - # script init - self.sjm_cmd = 'log_dir %s\n' % (self.logdir) - self.sjm_order = '' - self.shell_dict = defaultdict(str) - - # outdir dict - self.outdir_dic = {} for sample in self.fq_dict: self.outdir_dic[sample] = {} index = 0 @@ -190,7 +192,7 @@ job_end f'--thread {self.args.thread} ' ) cmd_line = step_prefix - if self.args.debug or self.__CONDA__ == "celescope_RD": + if self.args.debug: cmd_line += " --debug " for arg in args_dict: if args_dict[arg] is False: @@ -335,6 +337,7 @@ job_end f.write(self.shell_dict[sample]) def run(self): + self.prepare() self.run_steps() self.end() diff --git a/celescope/tools/step.py b/celescope/tools/step.py index 39e75b59..7f3224b6 100755 --- a/celescope/tools/step.py +++ b/celescope/tools/step.py @@ -10,6 +10,7 @@ import pandas as pd from jinja2 import Environment, FileSystemLoader, select_autoescape from celescope.tools.utils import add_log +from celescope.__init__ import HELP_DICT Metric = namedtuple("Metric", "name value total fraction") @@ -21,9 +22,8 @@ def s_common(parser): parser.add_argument('--outdir', help='Output diretory.', required=True) parser.add_argument('--assay', help='Assay name.', required=True) parser.add_argument('--sample', help='Sample name.', required=True) - parser.add_argument('--thread', help='Thread to use.', default=4) - parser.add_argument( - '--debug', help='If this argument is used, celescope may output addtional file for debugging.', action='store_true') + parser.add_argument('--thread', help=HELP_DICT['thread'], default=4) + parser.add_argument('--debug', help=HELP_DICT['debug'], action='store_true') return parser diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md index 7f12cc92..b78d34f8 100644 --- a/docs/snp/mkref.md +++ b/docs/snp/mkref.md @@ -1,3 +1,11 @@ +## Features +- Create dictionary file and fasta index for gatk SplitNCigarReads. 
+(https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) +Need to run `celescope rna mkref` first + +## Output +- fasta index +- gatk dictionary file ## Arguments diff --git a/generate_docs.py b/generate_docs.py index fc6b0845..a43105a0 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -36,7 +36,7 @@ def generate_single_step_doc(assay, step): def get_argument_docs(func_opts): argument_docs = "" - parser = argparse.ArgumentParser(description='CeleScope',formatter_class=ArgFormatter) + parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) func_opts(parser, sub_program=True) for argument in parser._option_string_actions: if not argument in ['-h', '--help']: @@ -49,7 +49,7 @@ def get_argument_docs(func_opts): def get_class_docs(step_module): - titles = ("Features", "Output") + titles = ("Features", "Output", "Usage") class_docs = "" for child in inspect.getmembers(step_module, inspect.isclass): """Filter out class not defined in step_module""" -- Gitee From 1c720f00e15377f4161a8b6531d8bbfc0dc4443d Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 11:21:18 +0800 Subject: [PATCH 84/96] vid single int --- celescope/snp/utils/plot_vid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/celescope/snp/utils/plot_vid.py b/celescope/snp/utils/plot_vid.py index 5211d01a..20efb76f 100644 --- a/celescope/snp/utils/plot_vid.py +++ b/celescope/snp/utils/plot_vid.py @@ -24,7 +24,7 @@ def parse_mapfile(mapfile): def read_row(row): sample = row[SAMPLE_COL_INDEX] match_dir = row[MATCH_DIR_COL_INDEX] - vid_list = [int(vid) for vid in row[VID_COL_INDEX].strip().split(',')] + vid_list = [int(vid) for vid in str(row[VID_COL_INDEX]).strip().split(',')] sample_vid_dict[sample] = vid_list sample_match_dir_dict[sample] = match_dir -- Gitee From 74ecaa0f3a33dc11dfe193ca99d738a29dd84b19 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 13:14:56 +0800 Subject: [PATCH 85/96] refactor generate docs --- docs/CHANGELOG.md | 178 ------------------- docs/CONTRIBUTING.md | 106 ----------- docs/capture_rna/analysis.md | 21 --- docs/capture_rna/count_capture_rna.md | 61 ------- docs/capture_rna/featureCounts.md | 19 -- docs/capture_rna/sample.md | 17 -- docs/capture_rna/star.md | 56 ------ docs/capture_virus/analysis_capture_virus.md | 19 -- docs/capture_virus/consensus.md | 19 -- docs/capture_virus/count_capture_virus.md | 19 -- docs/capture_virus/mkref.md | 15 -- docs/capture_virus/sample.md | 17 -- docs/capture_virus/star_virus.md | 32 ---- docs/citeseq/analysis_cite.md | 13 -- docs/citeseq/count_cite.md | 13 -- docs/citeseq/mapping_tag.md | 21 --- docs/citeseq/sample.md | 17 -- docs/dynaseq/conversion.md | 26 --- docs/dynaseq/replace_tsne.md | 30 ---- docs/dynaseq/replacement.md | 36 ---- docs/dynaseq/subsitution.md | 20 --- docs/fusion/count_fusion.md | 23 --- docs/fusion/mkref.md | 24 --- docs/fusion/sample.md | 17 -- docs/fusion/star_fusion.md | 32 ---- docs/hla/mapping_hla.md | 15 -- docs/hla/sample.md | 17 -- docs/manual.md | 38 ---- docs/manual_template.md | 23 --- docs/methods/rna.txt | 10 -- docs/mut/count_mut.md | 17 -- docs/mut/mapping_mut.md | 17 -- docs/mut/sample.md | 17 -- docs/quick_start.md | 110 ------------ docs/rna/analysis.md | 51 ------ docs/rna/mkref.md | 38 ---- docs/rna/star.md | 56 ------ docs/rna_virus/analysis_rna_virus.md | 17 -- docs/rna_virus/count.md | 27 --- docs/rna_virus/count_virus.md | 17 -- docs/rna_virus/featureCounts.md | 19 -- docs/rna_virus/sample.md | 17 -- 
docs/rna_virus/star.md | 56 ------ docs/rna_virus/star_virus.md | 32 ---- docs/snp/analysis_snp.md | 23 --- docs/snp/mkref.md | 21 --- docs/snp/variant_calling.md | 38 ---- docs/tag/analysis_tag.md | 19 -- docs/tag/count_tag.md | 44 ----- docs/tag/mapping_tag.md | 48 ----- docs/tag/split_tag.md | 26 --- docs/tcr_fl/assemble.md | 15 -- docs/tcr_fl/sample.md | 17 -- docs/tcr_fl/split_fq.md | 15 -- docs/tools/barcode.md | 61 ------- docs/tools/consensus.md | 24 --- docs/tools/count.md | 61 ------- docs/tools/cutadapt.md | 44 ----- docs/tools/featureCounts.md | 38 ---- docs/tools/sample.md | 17 -- docs/tools/target_metrics.md | 28 --- docs/vdj/count_vdj.md | 37 ---- docs/vdj/mapping_vdj.md | 35 ---- 63 files changed, 2056 deletions(-) delete mode 100755 docs/CHANGELOG.md delete mode 100644 docs/CONTRIBUTING.md delete mode 100644 docs/capture_rna/analysis.md delete mode 100644 docs/capture_rna/count_capture_rna.md delete mode 100644 docs/capture_rna/featureCounts.md delete mode 100644 docs/capture_rna/sample.md delete mode 100644 docs/capture_rna/star.md delete mode 100644 docs/capture_virus/analysis_capture_virus.md delete mode 100644 docs/capture_virus/consensus.md delete mode 100644 docs/capture_virus/count_capture_virus.md delete mode 100644 docs/capture_virus/mkref.md delete mode 100644 docs/capture_virus/sample.md delete mode 100644 docs/capture_virus/star_virus.md delete mode 100644 docs/citeseq/analysis_cite.md delete mode 100644 docs/citeseq/count_cite.md delete mode 100644 docs/citeseq/mapping_tag.md delete mode 100644 docs/citeseq/sample.md delete mode 100644 docs/dynaseq/conversion.md delete mode 100644 docs/dynaseq/replace_tsne.md delete mode 100644 docs/dynaseq/replacement.md delete mode 100644 docs/dynaseq/subsitution.md delete mode 100644 docs/fusion/count_fusion.md delete mode 100644 docs/fusion/mkref.md delete mode 100644 docs/fusion/sample.md delete mode 100644 docs/fusion/star_fusion.md delete mode 100644 docs/hla/mapping_hla.md delete mode 100644 docs/hla/sample.md delete mode 100755 docs/manual.md delete mode 100644 docs/manual_template.md delete mode 100755 docs/methods/rna.txt delete mode 100644 docs/mut/count_mut.md delete mode 100644 docs/mut/mapping_mut.md delete mode 100644 docs/mut/sample.md delete mode 100755 docs/quick_start.md delete mode 100644 docs/rna/analysis.md delete mode 100644 docs/rna/mkref.md delete mode 100644 docs/rna/star.md delete mode 100644 docs/rna_virus/analysis_rna_virus.md delete mode 100644 docs/rna_virus/count.md delete mode 100644 docs/rna_virus/count_virus.md delete mode 100644 docs/rna_virus/featureCounts.md delete mode 100644 docs/rna_virus/sample.md delete mode 100644 docs/rna_virus/star.md delete mode 100644 docs/rna_virus/star_virus.md delete mode 100644 docs/snp/analysis_snp.md delete mode 100644 docs/snp/mkref.md delete mode 100644 docs/snp/variant_calling.md delete mode 100644 docs/tag/analysis_tag.md delete mode 100644 docs/tag/count_tag.md delete mode 100644 docs/tag/mapping_tag.md delete mode 100644 docs/tag/split_tag.md delete mode 100644 docs/tcr_fl/assemble.md delete mode 100644 docs/tcr_fl/sample.md delete mode 100644 docs/tcr_fl/split_fq.md delete mode 100644 docs/tools/barcode.md delete mode 100644 docs/tools/consensus.md delete mode 100644 docs/tools/count.md delete mode 100644 docs/tools/cutadapt.md delete mode 100644 docs/tools/featureCounts.md delete mode 100644 docs/tools/sample.md delete mode 100644 docs/tools/target_metrics.md delete mode 100644 docs/vdj/count_vdj.md delete mode 100644 docs/vdj/mapping_vdj.md diff 
--git a/docs/CHANGELOG.md b/docs/CHANGELOG.md deleted file mode 100755 index 109dd6d7..00000000 --- a/docs/CHANGELOG.md +++ /dev/null @@ -1,178 +0,0 @@ -# Change Log - -## [unreleased] - 2021-06-09 -### Added - -### Changed - -### Fixed -- `celescope.tools.count` will report an error when there are multiple gtf or refFlat file under `genomeDir`. - -### Removed -- `celescope.tools.utils.glob_genomeDir` - -## [1.3.1] - 2021-06-09 -### Added - -- Add wdl workflow. - -- Add Seurat hashtag method in `celescope tag count_tag`. To get Seurat hashtag output, use `--debug`. However, there was a unsolved problem with this method: https://github.com/satijalab/seurat/issues/2549. - -### Changed - -- `{sample}_UMI_count_filtered1.tsv` in mapping_vdj changed to `{sample}_UMI_count_filtered.tsv` (remove `1` after filtered) - -### Fixed and Removed - -- Remove h5 file generation in R to avoid memory issues. - - -## [1.3.0] - 2021-05-28 - -### Added - -- `mkref` subcommand. See `celescope rna mkref`, `celescope fusion mkref` and `celescope virus mkref` for details. - -### Changed - -- Change the way to handle duplicate gene_name and gene_id in gtf file. - -Previous: - - - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. - - one gene_id with multiple gene_name: newer gene_name will overwrite older gene_name. - - duplicated (gene_name, gene_id): "_{count}" will be added to gene_name. - -Now: - - - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. - - one gene_id with multiple gene_name: error. - - duplicated (gene_name, gene_id): ignore duplicated records and print a warning. - -### Fixed - -- Fix `count tag` metrics order in merge.xls - -### Removed - -- Remove `--fusion_pos` from `celescope.fusion.count_fusion` - - -## [1.2.0] - 2021-05-19 - -### Added - -- Assay `rna` outputs .h5 file in 06.analysis directory. - -### Changed - -- Update Seurat from 2.3.4 to 4.0.1. - -- `--genomeDir` in `celescope.fusion.star_fusion` changed to `--fusion_genomeDir` to avoid misunderstanding. - -- Step `star` sort bam by samtools instead of STAR to avoid potential `not enough memory for BAM sorting` error: https://github.com/alexdobin/STAR/issues/1136 - -### Removed - -- Assay `rna` no longer outputs tab-delimited expression matrix file in 05.count directory. - - -## [1.1.9] - 2021-04-25 - -### Added - -- Add parameter `--coefficient` to `celescope tag count_tag` and `multi_tag` - - Default `0.1`. Minimum signal-to-noise ratio is calulated as `SNR_min = max(median(SNRs) * coefficient, 2)` - -- Add `.metrics.json` - -- Add `scopeV1` chemistry support. - -### Changed - -- Optimize speed and memory usage of step `barcode`(~2X faster) and `celescope.tools.count.downsample`(~15-25X faster, 1/2 memory usage). - -- Change filtering of linker from allowing two mismatches in total to two mismatches per segment; this will slightly increase the valid reads percentage. - -- Default output fastq files of `barcode` and `cutadapt` are not gzipped. Use `--gzipped` to get gzipped output. - -- Change the display of Barcode-rank plot in html report. - -### Fixed - -- Fix a bug that `celescope.tools.barcode.mismatch` cannot output all sequences correctly when n_mismatch>=2. - -- Fix an error when Numpy >= 1.2.0. - -- VDJ merge.xls can display all the metrics correctly. - -### Removed - -- Remove fastqc from `barcode` step. - - -## [1.1.8] - 2021-03-26 - -### Added - -- Add read consensus to VDJ pipeline. 
- - A consensus step was added before mapping to merge all the reads of the same - (barcode, UMI) into one UMI. For defailed consensus algorithm, refer to `celescope.tools.consensus`. - multi_vdj adds the parameter `--not_consensus` that you can skip the consensus step, and get the same results as v1.1.7. - -- Add parameter `--species` to `celescope vdj mapping_vdj` and `multi_vdj`. - - `--species` can be one of: - - `hs`: human - - `mmu`: mouse - -- Add parameter `--cell_calling_method` to `celescope rna count` and `multi_rna`. - - `--cell_calling_method` can be one of: - - `auto`: Same result as v1.1.7. - - `cellranger3`: Refer to the cell_calling algorithm of cellranger3, and the result is similar to cellranger3. - - `reflection`: Use the inflection point of the barcode-rank curve as the UMI threshold. The minimum UMI value is changed from initial threshold / 10 to initial threshold / 2 to prevent the use of a lower inflection point when there are multiple inflection points. - -- Add 4 tags to featureCounts bam. - - - `CB`: cell barcode - - `UB`: UMI - - `GN`: gene name - - `GX`: gene id - -- Add `--STAR_param` to `celescope rna STAR` - - Additional parameters of STAR can be passed into the `STAR` step. - -### Changed - -- One sample can have different chemistry fastq in mapfile. Version <= v1.1.7 will report this as an error. - -- Gtf file can be gzipped. - -- `multi_rna` can use 3 paramters: `--STAR_index`, `--gtf` and `--refFlat` instead of `--genomeDir` - -- Step `snpCalling` use mutract. - - -## [1.1.7] - 2020-12-16 - -### Added - -- Automatically detect Singleron chemistry version. - -### Changed - -- FeatureCounts use strand specificity. - -- Cutadapt default `overlap` change from `5` to `10`. - -- VDJ sort `NA` last. - -- `match clonetypes` are sorted by barcode_count(Frequency) first, then clonetype_ID. - - - - diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md deleted file mode 100644 index fb9ee986..00000000 --- a/docs/CONTRIBUTING.md +++ /dev/null @@ -1,106 +0,0 @@ -## Pull Requests -Create pull requests to `dev` branch - -## Lint code -Before pull requests, you should lint your code with the following command: -``` -pip install pylint -# lint -# W1618 (no-absolute-import) -# E1101 (no-member) -# W1633 (round-builtin) -# W1619 (old-division) -# W0105 (String statement has no effect) -# W0511 TODO! -# E1130 bad operand type for unary ~: _isnan (invalid-unary-operand-type) -# W0212 Access to a protected member _option_string_actions of a client class (protected-access) -pylint --disable=all --enable=E,W --disable=W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212 --jobs=8 celescope -``` -Your code should be rated at 10(i.e. no error or warning). - -## Write a new step -When you add a new step, you need to - - Create a step class which inherit from `celescope.tools.step.Step`. - - Create a function with the same name of the module. The main function `celescope` uses this function to run each step. - - Create a parser function with the name `get_opts_{module_name}`. `celescope` command line interface uses this function. The `sub_program` argument in this function hides all the arguments that you do not want to show in the `multi_{assay}` interface. - -For example, in `celescope.tools.cutadapt`: - -``` -from celescope.tools.step import Step, s_common -import celescope.tools.utils as utils - - -class Cutadapt(Step): - """ - Features - - Trim adapters in R2 reads with cutadapt. Default adapters includes: - - polyT=A{18}, 18 A bases. 
- - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. - - Output - - `cutadapt.log` Cutadapt output log file. - - `{sample}_clean_2.fq.gz` R2 reads file without adapters. - """ - - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - {some init code} - - @utils.add_log - def run(self): - {some code to run} - - -@utils.add_log -def cutadapt(args): - step_name = "cutadapt" - cutadapt_obj = Cutadapt(args, step_name) - cutadapt_obj.run() - - -def get_opts_cutadapt(parser, sub_program): - parser.add_argument('--adapter_fasta', help='Addtional adapter fasta file.') - parser.add_argument( - '--minimum_length', - help='Default `20`. Discard processed reads that are shorter than LENGTH.', - default=20 - ) - {other arguments} - if sub_program: - parser.add_argument('--fq', help='Required. R2 reads from step Barcode.', required=True) - parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true') - parser = s_common(parser) - return parser -``` - -## Docs -There is a python script at the root of this repo `generate_docs.py` to generate documents for each released step. The generated docs are in the `docs` folder. It will collect: -- Docstring of the step class. The Docstring should have sections named `Features` and `Output`. -- Help infomation in `get_opts_{module_name}` - -Released assays will be added to `manual.md`. - -## Tests -If you add new steps, you need to create a small data for integration tests. There is a test example in `celescope/tests/test_multi.py`. To run this example: - - -1. Get test data -``` -# If you have access to Singleron Nanjing HPC -copy -r /SGRNJ03/randd/user/zhouyiqi/multi_tests/test_folder {test_dir} -# Or clone from repo -git clone https://github.com/singleron-RD/celescope_tests.git -``` - -2. Run `pytest` -``` -Install pytest ->>> pip install pytest -Run all ->>> pytest -s ./tests/test_multi.py --test_dir {test_dir} -Run some tests ->>> pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag -``` - -Then you need to create your own test based on this example. \ No newline at end of file diff --git a/docs/capture_rna/analysis.md b/docs/capture_rna/analysis.md deleted file mode 100644 index 61a0265f..00000000 --- a/docs/capture_rna/analysis.md +++ /dev/null @@ -1,21 +0,0 @@ - - -## Arguments -`--outdir` output dir. - -`--assay` assay. - -`--sample` sample name. - -`--thread` None - -`--debug` debug. - -`--matrix_file` matrix file. - -`--genomeDir` genomeDir. - -`--save_rds` write rds to disk. - -`--type_marker_tsv` cell type marker tsv. - diff --git a/docs/capture_rna/count_capture_rna.md b/docs/capture_rna/count_capture_rna.md deleted file mode 100644 index 102eb020..00000000 --- a/docs/capture_rna/count_capture_rna.md +++ /dev/null @@ -1,61 +0,0 @@ -## Features -- Cell-calling: Distinguish cell barcodes from background barcodes. - -- Generate expression matrix. - -## Output -- `{sample}_all_matrix` The expression matrix of all detected barcodes. - Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix_10X` The expression matrix of the barcode that is identified to be the cell. -Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix.tsv.gz` The expression matrix of the barcode that is identified to be the cell, separated by tabs. -CeleScope >=1.2.0 does not output this file. 
- -- `{sample}_count_detail.txt.gz` 4 columns: - - barcode - - gene ID - - UMI count - - read_count - -- `{sample}_counts.txt` 6 columns: - - Barcode: barcode sequence - - readcount: read count of each barcode - - UMI2: UMI count (with reads per UMI >= 2) for each barcode - - UMI: UMI count for each barcode - - geneID: gene count for each barcode - - mark: cell barcode or backgound barcode. - - `CB` cell - `UB` background - -- `{sample}_downsample.txt` 3 columns: - - percent: percentage of sampled reads - - median_geneNum: median gene number per cell - - saturation: sequencing saturation - -- `barcode_filter_magnitude.pdf` Barcode-UMI plot. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--expected_cell_num` Default `3000`. Expected cell number. - -`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` Required. BAM file from featureCounts. - -`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%. - diff --git a/docs/capture_rna/featureCounts.md b/docs/capture_rna/featureCounts.md deleted file mode 100644 index 61e7f284..00000000 --- a/docs/capture_rna/featureCounts.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--gtf_type` Specify feature type in GTF annotation. - -`--genomeDir` None - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--input` None - diff --git a/docs/capture_rna/sample.md b/docs/capture_rna/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/capture_rna/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/capture_rna/star.md b/docs/capture_rna/star.md deleted file mode 100644 index c0d71407..00000000 --- a/docs/capture_rna/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## ## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). 
-Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads. - -`--STAR_param` Other STAR parameters. - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/capture_virus/analysis_capture_virus.md b/docs/capture_virus/analysis_capture_virus.md deleted file mode 100644 index 01aeb0c1..00000000 --- a/docs/capture_virus/analysis_capture_virus.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--umi_threshold` method to find virus UMI threshold - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--match_dir` match_dir - -`--virus_file` virus UMI count file - diff --git a/docs/capture_virus/consensus.md b/docs/capture_virus/consensus.md deleted file mode 100644 index f09fdd07..00000000 --- a/docs/capture_virus/consensus.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--threshold` valid base threshold. - -`--not_consensus` input fastq is not consensus. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq` None - diff --git a/docs/capture_virus/count_capture_virus.md b/docs/capture_virus/count_capture_virus.md deleted file mode 100644 index 2789114d..00000000 --- a/docs/capture_virus/count_capture_virus.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--min_query_length` minimum query length - -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - -`--match_dir` matched rna_virus directory - -`--virus_bam` None - diff --git a/docs/capture_virus/mkref.md b/docs/capture_virus/mkref.md deleted file mode 100644 index a3627da9..00000000 --- a/docs/capture_virus/mkref.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. - -`--dry_run` Only write config file and exit. - -`--fasta` virus fasta file - -`--genomeSAindexNbases` STAR genomeSAindexNbases - diff --git a/docs/capture_virus/sample.md b/docs/capture_virus/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/capture_virus/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. 
- -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/capture_virus/star_virus.md b/docs/capture_virus/star_virus.md deleted file mode 100644 index dba66990..00000000 --- a/docs/capture_virus/star_virus.md +++ /dev/null @@ -1,32 +0,0 @@ - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads. - -`--STAR_param` Other STAR parameters. - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--virus_genomeDir` virus genome dir. - diff --git a/docs/citeseq/analysis_cite.md b/docs/citeseq/analysis_cite.md deleted file mode 100644 index df9b21da..00000000 --- a/docs/citeseq/analysis_cite.md +++ /dev/null @@ -1,13 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--match_dir` match_dir - -`--citeseq_mtx` citeseq matrix .gz file - -`--assay` assay - diff --git a/docs/citeseq/count_cite.md b/docs/citeseq/count_cite.md deleted file mode 100644 index 55cbeaf4..00000000 --- a/docs/citeseq/count_cite.md +++ /dev/null @@ -1,13 +0,0 @@ - - -## Arguments -`--match_dir` matched scRNA-Seq CeleScope directory path - -`--outdir` output dir - -`--sample` sample name - -`--assay` assay - -`--read_count_file` tag read count file - diff --git a/docs/citeseq/mapping_tag.md b/docs/citeseq/mapping_tag.md deleted file mode 100644 index 0b77fe0a..00000000 --- a/docs/citeseq/mapping_tag.md +++ /dev/null @@ -1,21 +0,0 @@ - - -## Arguments -`--fq_pattern` read2 fastq pattern. - -`--barcode_fasta` barcode fasta. - -`--linker_fasta` linker fasta. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq` clean read2. - diff --git a/docs/citeseq/sample.md b/docs/citeseq/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/citeseq/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/dynaseq/conversion.md b/docs/dynaseq/conversion.md deleted file mode 100644 index bfb0cb2a..00000000 --- a/docs/dynaseq/conversion.md +++ /dev/null @@ -1,26 +0,0 @@ -## Features -- Get conversion pos in each read. - - Get snp info. - -## Output -- `{sample}.PosTag.bam` Bam file with conversion info. -- `{sample}.PosTag.csv` SNP info in csv format. - - -## Arguments -`--strand` gene strand file - -`--bam` featureCount bam - -`--cell` barcode cell list - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. 
- diff --git a/docs/dynaseq/replace_tsne.md b/docs/dynaseq/replace_tsne.md deleted file mode 100644 index 31ed90ce..00000000 --- a/docs/dynaseq/replace_tsne.md +++ /dev/null @@ -1,30 +0,0 @@ -## Features -- Replace rate in each cluster -- Top replace genes in each cluster - -## Output -- `{sample}.rep_in_tsne.txt` Replace rate in each cluster. -- `{sample}.rep_in_tsne_top10` Top 10 replace genes in each cluster. - - -## Arguments -`--tsne` tsne file - -`--mat` matrix rep file - -`--rep` cell rep file - -`--mincell` turn-over in at least cells, default 5 - -`--topgene` top N genes,default 10 - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/dynaseq/replacement.md b/docs/dynaseq/replacement.md deleted file mode 100644 index 1184777c..00000000 --- a/docs/dynaseq/replacement.md +++ /dev/null @@ -1,36 +0,0 @@ -## Features -- Computes the replacement rates in each cell and gene. -- Boxplots for rates distribution. - -## Output -- `{sample}.TC_matrix.rds` New and old info for each barcode/gene/umi. -- `{sample}.new_matrix.tsv.gz` New RNA matrix. -- `{sample}.old_matrix.tsv.gz` Old RNA matrix. -- `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell. -- `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene. -- `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene. - - -## Arguments -`--bg_cov` background snp depth filter, lower than bg_cov will be discarded. Only valid in csv format - -`--bam` bam file - -`--bg` background snp file - -`--cell_keep` filter cell - -`--min_cell` a gene expressed in at least cells, default 10 - -`--min_gene` at least gene num in a cell, default 10 - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/dynaseq/subsitution.md b/docs/dynaseq/subsitution.md deleted file mode 100644 index e2b7b169..00000000 --- a/docs/dynaseq/subsitution.md +++ /dev/null @@ -1,20 +0,0 @@ -## Features -- Computes the overall conversion rates in reads and plots a barplot. - -## Output -- `{sample}.substitution.txt` Tab-separated table of the overall conversion rates. - - -## Arguments -`--bam` bam file - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/fusion/count_fusion.md b/docs/fusion/count_fusion.md deleted file mode 100644 index 39063118..00000000 --- a/docs/fusion/count_fusion.md +++ /dev/null @@ -1,23 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` None - -`--match_dir` match scRNA-Seq dir - -`--fusion_genomeDir` fusion genome directory - -`--flanking_base` None - -`--UMI_min` None - diff --git a/docs/fusion/mkref.md b/docs/fusion/mkref.md deleted file mode 100644 index 24ca414f..00000000 --- a/docs/fusion/mkref.md +++ /dev/null @@ -1,24 +0,0 @@ - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. 
- -`--dry_run` Only write config file and exit. - -`--fasta` fusion fasta file - -`--fusion_pos` fusion position file. A two column tab-delimited text file with header. -"pos" is the end postion of the first gene(1-based). -e.g. -tag pos -PML_3 183 -PML_4 254 -PML_5 326 -PML_6 204 - -`--genomeSAindexNbases` STAR genomeSAindexNbases - diff --git a/docs/fusion/sample.md b/docs/fusion/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/fusion/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/fusion/star_fusion.md b/docs/fusion/star_fusion.md deleted file mode 100644 index 4ac60c75..00000000 --- a/docs/fusion/star_fusion.md +++ /dev/null @@ -1,32 +0,0 @@ - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fusion_genomeDir` fusion gene STAR index genome directory - diff --git a/docs/hla/mapping_hla.md b/docs/hla/mapping_hla.md deleted file mode 100644 index d4f5f1e3..00000000 --- a/docs/hla/mapping_hla.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--fq` None - -`--assay` assay - -`--match_dir` match scRNA-Seq dir - -`--thread` number of thread - diff --git a/docs/hla/sample.md b/docs/hla/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/hla/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/manual.md b/docs/manual.md deleted file mode 100755 index e9caf430..00000000 --- a/docs/manual.md +++ /dev/null @@ -1,38 +0,0 @@ -## Introduction -CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. - -Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. - -Currently, CeleScope includes the follwing pipelines: - -- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. 
It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). - -- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. - -- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. - - -## [Quick start](quick_start.md) - -## [Change log](CHANGELOG.md) - -## Pre-processing - -- [barcode](tools/barcode.md) -- [cutadapt](tools/cutadapt.md) - -## Single-cell rna -- [mkref](rna/mkref.md) -- [star](rna/star.md) -- [featureCounts](tools/featureCounts.md) -- [count](tools/count.md) -- [analysis](rna/analysis.md) -## Single-cell vdj -- [consensus](tools/consensus.md) -- [mapping_vdj](vdj/mapping_vdj.md) -- [count_vdj](vdj/count_vdj.md) -## Single-cell tag -- [mapping_tag](tag/mapping_tag.md) -- [count_tag](tag/count_tag.md) -- [analysis_tag](tag/analysis_tag.md) -- [split_tag](tag/split_tag.md) diff --git a/docs/manual_template.md b/docs/manual_template.md deleted file mode 100644 index c524de94..00000000 --- a/docs/manual_template.md +++ /dev/null @@ -1,23 +0,0 @@ -## Introduction -CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. - -Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. - -Currently, CeleScope includes the follwing pipelines: - -- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). - -- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. - -- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. - - -## [Quick start](quick_start.md) - -## [Change log](CHANGELOG.md) - -## Pre-processing - -- [barcode](tools/barcode.md) -- [cutadapt](tools/cutadapt.md) - diff --git a/docs/methods/rna.txt b/docs/methods/rna.txt deleted file mode 100755 index 168a02a4..00000000 --- a/docs/methods/rna.txt +++ /dev/null @@ -1,10 +0,0 @@ -Single-cell transcriptomics and analysis - -Raw reads were processed to generate gene expression profiles using CeleScope v1.2.0(Singleron Biotechnologies) -with default parameters. Briefly, Barcodes and UMIs were extracted from R1 reads and corrected. Adapter sequences and poly A tails were trimmed from R2 reads and the trimmed R2 reads were aligned against the {GRCh38 (hg38)} {GRCm38 (mm10)} transcriptome -using STAR(v2.6.1b). Uniquely mapped reads were then assigned to exons with FeatureCounts(v2.0.1). 
Successfully Assigned Reads with the same cell barcode, UMI and gene were grouped together to generate the gene expression matrix for further analysis. - -Single-cell analyses were performed using the Seurat package(v4.0.1). All the variable genes selected by the FindVariableFeatures function were used to compute the PCs. The first 20 PCs and a resolution of 0.6 were used for clustering and tSNE visualization. - - - diff --git a/docs/mut/count_mut.md b/docs/mut/count_mut.md deleted file mode 100644 index 38699f30..00000000 --- a/docs/mut/count_mut.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--bam` None - -`--assay` assay - -`--mut_file` mutation file - -`--match_dir` match scRNA-Seq dir - -`--shift_base` None - diff --git a/docs/mut/mapping_mut.md b/docs/mut/mapping_mut.md deleted file mode 100644 index afccd1b6..00000000 --- a/docs/mut/mapping_mut.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--fq` None - -`--assay` assay - -`--indel_genomeDir` insertion or deletion STAR indexed genome directory - -`--thread` STAR thread - -`--outFilterMatchNmin` STAR outFilterMatchNmin - diff --git a/docs/mut/sample.md b/docs/mut/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/mut/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/quick_start.md b/docs/quick_start.md deleted file mode 100755 index 57d2327f..00000000 --- a/docs/quick_start.md +++ /dev/null @@ -1,110 +0,0 @@ -# Quick start - -CeleScope contains interfaces `multi_{assay}` to generate pipeline scripts for all assays. Assays can be one of: - -- rna -- vdj -- tag - -Run `multi_{assay} -h` for help. - - -## Usage Example - -- Single-cell rna - - ``` - conda activate celescope - multi_rna\ - --mapfile ./rna.mapfile\ - --genomeDir /SGRNJ/Public/Database/genome/homo_mus\ - --thread 8\ - --mod shell - ``` -`--mapfile` Required. Mapfile path. - -`--genomeDir` Required. Required. Genome directory. - -`--thread` The recommended setting is 8, and the maximum should not exceed 20. - -`--mod` Create `sjm`(simple job manager https://github.com/StanfordBioinformatics/SJM) or `shell` scripts. - -Scripts above will generate a `shell` directory containing `{sample}.sh` files. - -You can start your analysis by running: -``` -sh ./shell/{sample}.sh -``` - -- Single cell vdj - -``` -conda activate celescope -multi_vdj \ - --mapfile ./vdj.mapfile \ - --type TCR \ - --thread 8 \ - --mod shell -``` - -`--type` Required. TCR or BCR. - -- Single cell tag - -``` -conda activate celescope -multi_tag \ - --mapfile ./tag.mapfile\ - --barcode_fasta ./smk_barcode.fa\ - --fq_pattern L25C45\ - --mod shell -``` - -`--barcode_fasta` Required. Tag barcode fasta file. -``` ->tag_0 -GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC ->tag_1 -TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG ->tag_2 -AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA ->tag_3 -CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG -``` - -`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. - -`L` linker(common sequences) -`C` tag barcode - -## How to write mapfile - -Mapfile is a tab-delimited text file with as least three columns. 
Each line of mapfile represents paired-end fastq files. - -1st column: Fastq file prefix. -2nd column: Fastq file directory path. -3rd column: Sample name, which is the prefix of all output files. -4th column: The 4th column has different meaning for each assay. The single cell rna directory after running CeleScope is called `matched_dir`. -- `rna` Optional, forced cell number. -- `vdj` Optional, matched_dir. -- `tag` Required, matched_dir. - -### Example - -Sample1 has 2 paired-end fastq files located in 2 different directories(fastq_dir1 and fastq_dir2). Sample2 has 1 paired-end fastq file located in fastq_dir1. -``` -$cat ./my.mapfile -fastq_prefix1 fastq_dir1 sample1 -fastq_prefix2 fastq_dir2 sample1 -fastq_prefix3 fastq_dir1 sample2 - -$ls fastq_dir1 -fastq_prefix1_1.fq.gz fastq_prefix1_2.fq.gz -fastq_prefix3_1.fq.gz fastq_prefix3_2.fq.gz - -$ls fastq_dir2 -fastq_prefix2_1.fq.gz fastq_prefix2_2.fq.gz -``` - - - diff --git a/docs/rna/analysis.md b/docs/rna/analysis.md deleted file mode 100644 index 9ddfd1b3..00000000 --- a/docs/rna/analysis.md +++ /dev/null @@ -1,51 +0,0 @@ -## Features -- Cell clustering with Seurat. - -- Calculate the marker gene of each cluster. - -- Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. - -## Output -- `markers.tsv` Marker genes of each cluster. - -- `tsne_coord.tsv` t-SNE coordinates and clustering information. - -- `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` -parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", -it means that the given marker is not enough to identify the cluster. - - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--save_rds` Write rds to disk. - -`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: -``` -cell_type marker -Alveolar "CLDN18,FOLR1,AQP4,PEBP4" -Endothelial "CLDN5,FLT1,CDH5,RAMP2" -Epithelial "CAPS,TMEM190,PIFO,SNTN" -Fibroblast "COL1A1,DCN,COL1A2,C1R" -B_cell "CD79A,IGKC,IGLC3,IGHG3" -Myeloid "LYZ,MARCO,FCGR3A" -T_cell "CD3D,TRBC1,TRBC2,TRAC" -LUAD "NKX2-1,NAPSA,EPCAM" -LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" -``` - -`--matrix_file` Required. Matrix_10X directory from step count. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna/mkref.md b/docs/rna/mkref.md deleted file mode 100644 index c1b3d592..00000000 --- a/docs/rna/mkref.md +++ /dev/null @@ -1,38 +0,0 @@ -## Features -- Create a genome reference directory. - -## Output - -- STAR genome index files - -- Genome refFlat file - -- Genome config file -``` -$ cat celescope_genome.config -[genome] -genome_name = Homo_sapiens_ensembl_99 -genome_type = rna -fasta = Homo_sapiens.GRCh38.dna.primary_assembly.fa -gtf = Homo_sapiens.GRCh38.99.gtf -refflat = Homo_sapiens_ensembl_99.refFlat -``` - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. 
- -`--dry_run` Only write config file and exit. - -`--fasta` Required. Genome fasta file. Must be relative file path to genomeDir. - -`--gtf` Required. Genome gtf file. Must be relative file path to genomeDir. - -`--mt_gene_list` Mitochondria gene list file. Must be relative file path to genomeDir. -It is a plain text file with one gene per line. -If not provided, will use `MT-` and `mt-` to determine mitochondria genes. - diff --git a/docs/rna/star.md b/docs/rna/star.md deleted file mode 100644 index ec3b5211..00000000 --- a/docs/rna/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). -Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna_virus/analysis_rna_virus.md b/docs/rna_virus/analysis_rna_virus.md deleted file mode 100644 index 8893b4e1..00000000 --- a/docs/rna_virus/analysis_rna_virus.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - -`--matrix_file` matrix file - -`--virus_file` virus UMI count file - diff --git a/docs/rna_virus/count.md b/docs/rna_virus/count.md deleted file mode 100644 index 182ecfa4..00000000 --- a/docs/rna_virus/count.md +++ /dev/null @@ -1,27 +0,0 @@ -## Features -- count - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. 
- -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` None - -`--force_cell_num` force cell number. - -`--genomeDir` genome directory. - -`--gtf` gtf file path. - -`--expected_cell_num` expected cell number. - -`--cell_calling_method` cell calling methods. - diff --git a/docs/rna_virus/count_virus.md b/docs/rna_virus/count_virus.md deleted file mode 100644 index 60f09d5c..00000000 --- a/docs/rna_virus/count_virus.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - -`--virus_bam` None - -`--barcode_file` None - diff --git a/docs/rna_virus/featureCounts.md b/docs/rna_virus/featureCounts.md deleted file mode 100644 index 61e7f284..00000000 --- a/docs/rna_virus/featureCounts.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--gtf_type` Specify feature type in GTF annotation. - -`--genomeDir` None - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--input` None - diff --git a/docs/rna_virus/sample.md b/docs/rna_virus/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/rna_virus/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/rna_virus/star.md b/docs/rna_virus/star.md deleted file mode 100644 index c0d71407..00000000 --- a/docs/rna_virus/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## ## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). -Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads. 
- -`--STAR_param` Other STAR parameters. - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna_virus/star_virus.md b/docs/rna_virus/star_virus.md deleted file mode 100644 index 7ef14bd0..00000000 --- a/docs/rna_virus/star_virus.md +++ /dev/null @@ -1,32 +0,0 @@ - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--virus_genomeDir` virus genome dir - diff --git a/docs/snp/analysis_snp.md b/docs/snp/analysis_snp.md deleted file mode 100644 index fb2bd136..00000000 --- a/docs/snp/analysis_snp.md +++ /dev/null @@ -1,23 +0,0 @@ - - -## Arguments -`--annovar_config` annovar soft config file - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--match_dir` match_dir - -`--vcf` vcf file - -`--CID_file` CID_file - -`--variant_count_file` variant count file - diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md deleted file mode 100644 index b78d34f8..00000000 --- a/docs/snp/mkref.md +++ /dev/null @@ -1,21 +0,0 @@ -## Features -- Create dictionary file and fasta index for gatk SplitNCigarReads. -(https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) -Need to run `celescope rna mkref` first - -## Output -- fasta index -- gatk dictionary file - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. - -`--dry_run` Only write config file and exit. - -`--fasta` fasta file - diff --git a/docs/snp/variant_calling.md b/docs/snp/variant_calling.md deleted file mode 100644 index aed2d6fa..00000000 --- a/docs/snp/variant_calling.md +++ /dev/null @@ -1,38 +0,0 @@ -## Features -- Perform variant calling. - -## Output - -`{sample}_VID.tsv` A unique numeric ID is assigned for each variant. - -`{sample}_CID.tsv` A unique numeric ID is assigned for each cell. - -`{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count. - -`{sample}_support.mtx` Support matrix, only high quality bases are considered. -0 : no reads/UMIs cover the position. -1 : all reads/UMIs at the position support the ref allele. -2 : all reads/UMIs at the position support the alt allele. -3 : one or more reads/UMIs support both the alt and the ref allele. - - -## Arguments -`--genomeDir` Genome directory after running `mkref`. - -`--vcf` VCF file. 
If vcf file is not provided, celescope will perform variant calling at single cell level -and use these variants as input vcf. - -`--bam` Input BAM file from step `target_metrics`. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tag/analysis_tag.md b/docs/tag/analysis_tag.md deleted file mode 100644 index da0f476e..00000000 --- a/docs/tag/analysis_tag.md +++ /dev/null @@ -1,19 +0,0 @@ -## Features -- Combine scRNA-Seq clustering infromation with tag assignment. - - -## Arguments -`--tsne_tag_file` `{sample}_tsne_tag.tsv` from count_tag. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tag/count_tag.md b/docs/tag/count_tag.md deleted file mode 100644 index 815b3eb1..00000000 --- a/docs/tag/count_tag.md +++ /dev/null @@ -1,44 +0,0 @@ -## Features -- Assign tag to each cell barcode and summarize. - -## Output - -- `{sample}_umi_tag.tsv` - - `first column` cell barcode - `last column` assigned tag - `columns between first and last` UMI count for each tag - -- `{sample}_tsne_tag.tsv` it is `{sample}_umi_tag.tsv` with t-SNE coordinates, gene_counts and cluster infomation - -- `{sample}_cluster_count.tsv` cell barcode number assigned to *undeterminded*, *multiplet* and *each tag* - - -## Arguments -`--UMI_min` Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*. - -`--dim` Default=1. Tag dimentions. Usually we use 1-dimentional tag. - -`--SNR_min` Default='auto'. Minimum signal-to-noise ratio. -Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. - -`--combine_cluster` Conbine cluster tsv file. - -`--coefficient` Default=0.1. If `SNR_min` is 'auto', minimum signal-to-noise ratio is calulated as -`SNR_min = max(median(SNRs) * coefficient, 2)`. -Smaller `coefficient` will cause less *multiplet* in the tag assignment. - -`--read_count_file` Tag read count file. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tag/mapping_tag.md b/docs/tag/mapping_tag.md deleted file mode 100644 index 1f7ab1ae..00000000 --- a/docs/tag/mapping_tag.md +++ /dev/null @@ -1,48 +0,0 @@ -## Features -- Align R2 reads to the tag barcode fasta. - -## Output - -- `{sample}_read_count.tsv` tab-delimited text file with 4 columns. - - `barcode` cell barcode - `tag_name` tag name in barcode_fasta - `UMI` UMI sequence - `read_count` read count per UMI - - -## Arguments -`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. -`L` linker(common sequences) -`C` tag barcode - -`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode -sequence in R2 reads with all tag barcode sequence in barcode_fasta. -It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. -If no such tag exists, the read is classified as invalid. 
-``` ->tag_0 -GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC ->tag_1 -TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG ->tag_2 -AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA ->tag_3 -CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG -``` - -`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads -with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq` R2 read fastq. - diff --git a/docs/tag/split_tag.md b/docs/tag/split_tag.md deleted file mode 100644 index 5a43f7f8..00000000 --- a/docs/tag/split_tag.md +++ /dev/null @@ -1,26 +0,0 @@ -## Features -- Split scRNA-Seq fastq according to tag assignment. - -## Output -- `fastq/{tag}_{1,2}.fq` Fastq files of each tag. - - -## Arguments -`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment. - -`--umi_tag_file` UMI tag file. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--R1_read` R1 read path. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tcr_fl/assemble.md b/docs/tcr_fl/assemble.md deleted file mode 100644 index 95662af1..00000000 --- a/docs/tcr_fl/assemble.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fastq_dir` None - diff --git a/docs/tcr_fl/sample.md b/docs/tcr_fl/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/tcr_fl/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/tcr_fl/split_fq.md b/docs/tcr_fl/split_fq.md deleted file mode 100644 index fe767158..00000000 --- a/docs/tcr_fl/split_fq.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--fq` None - -`--assay` assay - -`--match_dir` match scRNA-Seq dir - -`--nCell` select top N cell - diff --git a/docs/tools/barcode.md b/docs/tools/barcode.md deleted file mode 100644 index 9f31bd94..00000000 --- a/docs/tools/barcode.md +++ /dev/null @@ -1,61 +0,0 @@ -## Features - -- Demultiplex barcodes. -- Filter invalid R1 reads, which includes: - - Reads without linker: the mismatch between linkers and all linkers in the whitelist is greater than 2. - - Reads without correct barcode: the mismatch between barcodes and all barcodes in the whitelist is greater than 1. - - Reads without polyT: the number of T bases in the defined polyT region is less than 10. - - Low quality reads: low sequencing quality in barcode and UMI regions. - -## Output - -- `01.barcode/{sample}_2.fq(.gz)` Demultiplexed R2 reads. Barcode and UMI are contained in the read name. The format of -the read name is `{barcode}_{UMI}_{read ID}`. - - -## Arguments -`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. 
Can be one of: -- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. -- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. -- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the -same time. - -`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number - of bases. -- `C`: cell barcode -- `L`: linker(common sequences) -- `U`: UMI -- `T`: poly T - -`--whitelist` Cell barcode whitelist file path, one cell barcode per line. - -`--linker` Linker whitelist file path, one linker per line. - -`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. - -`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. - -`--nopolyT` Outputs R1 reads without polyT. - -`--noLinker` Outputs R1 reads without correct linker. - -`--allowNoPolyT` Allow valid reads without polyT. - -`--allowNoLinker` Allow valid reads without correct linker. - -`--gzip` Output gzipped fastq files. - -`--fq1` R1 fastq file. Multiple files are separated by comma. - -`--fq2` R2 fastq file. Multiple files are separated by comma. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/consensus.md b/docs/tools/consensus.md deleted file mode 100644 index 77e11286..00000000 --- a/docs/tools/consensus.md +++ /dev/null @@ -1,24 +0,0 @@ -## Features -- Consensus all the reads of the same (barcode, UMI) combinations into one read(UMI). - -## Output -- `{sample}_consensus.fq` Consensus fastq. - - -## Arguments -`--threshold` Default 0.5. Valid base threshold. - -`--not_consensus` Skip the consensus step. - -`--fq` Required. Fastq file. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/count.md b/docs/tools/count.md deleted file mode 100644 index 102eb020..00000000 --- a/docs/tools/count.md +++ /dev/null @@ -1,61 +0,0 @@ -## Features -- Cell-calling: Distinguish cell barcodes from background barcodes. - -- Generate expression matrix. - -## Output -- `{sample}_all_matrix` The expression matrix of all detected barcodes. - Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix_10X` The expression matrix of the barcode that is identified to be the cell. -Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix.tsv.gz` The expression matrix of the barcode that is identified to be the cell, separated by tabs. -CeleScope >=1.2.0 does not output this file. - -- `{sample}_count_detail.txt.gz` 4 columns: - - barcode - - gene ID - - UMI count - - read_count - -- `{sample}_counts.txt` 6 columns: - - Barcode: barcode sequence - - readcount: read count of each barcode - - UMI2: UMI count (with reads per UMI >= 2) for each barcode - - UMI: UMI count for each barcode - - geneID: gene count for each barcode - - mark: cell barcode or backgound barcode. 
- - `CB` cell - `UB` background - -- `{sample}_downsample.txt` 3 columns: - - percent: percentage of sampled reads - - median_geneNum: median gene number per cell - - saturation: sequencing saturation - -- `barcode_filter_magnitude.pdf` Barcode-UMI plot. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--expected_cell_num` Default `3000`. Expected cell number. - -`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` Required. BAM file from featureCounts. - -`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%. - diff --git a/docs/tools/cutadapt.md b/docs/tools/cutadapt.md deleted file mode 100644 index e75d6e72..00000000 --- a/docs/tools/cutadapt.md +++ /dev/null @@ -1,44 +0,0 @@ -## Features -- Trim adapters in R2 reads with cutadapt. Default adapters includes: - - polyT=A{18}, 18 A bases. - - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. - -## Output -- `cutadapt.log` Cutadapt output log file. -- `{sample}_clean_2.fq.gz` R2 reads file without adapters. - - -## Arguments -`--adapter_fasta` Addtional adapter fasta file. - -`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. - -`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). -Some Illumina instruments use a two-color chemistry to encode the four bases. -This includes the NextSeq and the NovaSeq. -In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. -However, dark cycles also occur when sequencing “falls off” the end of the fragment. -The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. - -`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, -short matches can occur by chance, leading to erroneously trimmed bases. -For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. -To reduce the number of falsely trimmed bases, the alignment algorithm requires that -at least {overlap} bases match between adapter and read. - -`--insert` Default `150`. Read2 insert length. - -`--fq` Required. R2 reads from step Barcode. - -`--gzip` Output gzipped fastq - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/featureCounts.md b/docs/tools/featureCounts.md deleted file mode 100644 index 3822ca55..00000000 --- a/docs/tools/featureCounts.md +++ /dev/null @@ -1,38 +0,0 @@ -## Features - -- Assigning uniquely mapped reads to genomic features with FeatureCounts. - -## Output -- `{sample}` Numbers of reads assigned to features (or meta-features). - -- `{sample}_summary` Stat info for the overall summrization results, including number of -successfully assigned reads and number of reads that failed to be assigned due to -various reasons (these reasons are included in the stat info). 
- -- `{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam` featureCounts output BAM, -sorted by coordinates;BAM file contains tags as following(Software Version>=1.1.8): - - CB cell barcode - - UB UMI - - GN gene name - - GX gene id - -- `{sample}_name_sorted.bam` featureCounts output BAM, sorted by read name. - - -## Arguments -`--gtf_type` Specify feature type in GTF annotation - -`--genomeDir` Required. Genome directory. - -`--input` Required. BAM file path. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/sample.md b/docs/tools/sample.md deleted file mode 100644 index e6fb6ce3..00000000 --- a/docs/tools/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file - -`--chemistry` chemistry version - diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md deleted file mode 100644 index d2fbaa04..00000000 --- a/docs/tools/target_metrics.md +++ /dev/null @@ -1,28 +0,0 @@ -## Features -- Filter bam file - - Filter reads that are not cell-associated. - - Filter reads that are not mapped to target genes. - -- Collect enrichment metrics. - -## Output -- `filtered.bam` BAM file after filtering. - - -## Arguments -`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported. - -`--bam` Input bam file - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/vdj/count_vdj.md b/docs/vdj/count_vdj.md deleted file mode 100644 index bd10f86d..00000000 --- a/docs/vdj/count_vdj.md +++ /dev/null @@ -1,37 +0,0 @@ -## Features -- Cell-calling based on barcode-UMI rank. -- Summarize clonetypes infomation. - -## Output -- `{sample}_cell_confident.tsv` The clone type of VDJ cell barcode, each chain occupies one line. - -- `{sample}_cell_confident_count.tsv` The clone type of VDJ cell barcode, each cell occupies one line. - -- `{sample}_clonetypes.tsv` The count and percentage of each clonetypes of VDJ cell barcode. - -- `{sample}_match_clonetypes.tsv` When summarize clonetypes, only consider barcodes in the match scRNA-Seq library. -This file will only be produced when the `match_dir` parameter is provided. - - -## Arguments -`--type` Required. `TCR` or `BCR`. - -`--UMI_min` Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell. - -`--iUMI` Default `1`. Minimum number of UMI of identical receptor type and CDR3. -For each (barcode, chain) combination, only UMI>=iUMI is considered valid. - -`--UMI_count_filter_file` Required. File from step mapping_vdj. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. 
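
The `UMI_min`/`iUMI` filtering described in count_vdj above is straightforward to prototype. A minimal sketch in Python, assuming a tab-delimited table with `barcode`, `chain` and `UMI` columns (the real `{sample}_UMI_count_filtered.tsv` layout may differ):

```python
import pandas as pd

# Hypothetical file name and column names, for illustration only.
df = pd.read_csv("sample_UMI_count_filtered.tsv", sep="\t")

iUMI = 1      # minimum UMIs per (barcode, chain) record
UMI_min = 10  # a fixed threshold instead of 'auto'

# Keep only (barcode, chain) records supported by >= iUMI UMIs.
valid = df[df["UMI"] >= iUMI]

# Call a barcode as a VDJ cell when its summed UMI count reaches UMI_min.
umi_per_barcode = valid.groupby("barcode")["UMI"].sum()
cell_barcodes = umi_per_barcode[umi_per_barcode >= UMI_min].index.tolist()
```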
- diff --git a/docs/vdj/mapping_vdj.md b/docs/vdj/mapping_vdj.md deleted file mode 100644 index 25bb304a..00000000 --- a/docs/vdj/mapping_vdj.md +++ /dev/null @@ -1,35 +0,0 @@ -## Features -- Align R2 reads to IGMT(http://www.imgt.org/) database sequences with mixcr. - -## Output -- `{sample}_consensus.fasta` Fasta file after UMI consensus. - -- `{sample}_UMI_count_unfiltered.tsv` UMI reading for each (barcode, chain, VJ_pair) combination. - -- `{sample}_UMI_count_filtered.tsv` For each (barcode, chain) combination, only the record with the -most VJ_pair UMI reads is kept. - -- `{sample}_align.txt` Result report. - -- `{sample}_alignments.txt` The alignment result of each UMI/read. - - -## Arguments -`--type` TCR or BCR - -`--species` Default `hs`. `hs`(human) or `mmu`(mouse). - -`--not_consensus` Input fastq is not consensused. - -`--fq` Required. Input fastq file. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -- Gitee From 0681e783c0ebe8fbc702c28e23cb4e1dcf76448c Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 13:15:13 +0800 Subject: [PATCH 86/96] refactor --- celescope/snp/mkref.py | 4 +- celescope/tools/multi.py | 41 +++---- celescope/tools/utils.py | 106 ------------------ docs_template/CHANGELOG.md | 178 +++++++++++++++++++++++++++++++ docs_template/CONTRIBUTING.md | 106 ++++++++++++++++++ docs_template/manual_template.md | 23 ++++ docs_template/quick_start.md | 110 +++++++++++++++++++ generate_docs.py | 90 ++++++++++------ methods/rna.txt | 10 ++ 9 files changed, 508 insertions(+), 160 deletions(-) create mode 100755 docs_template/CHANGELOG.md create mode 100644 docs_template/CONTRIBUTING.md create mode 100644 docs_template/manual_template.md create mode 100755 docs_template/quick_start.md create mode 100755 methods/rna.txt diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py index 32fd2dd0..e8659428 100644 --- a/celescope/snp/mkref.py +++ b/celescope/snp/mkref.py @@ -21,8 +21,8 @@ class Mkref_snp(Mkref): Usage ``` # run celescope rna mkref first - celescope snp mkref \ - --genome_name Homo_sapiens_ensembl_99 \ + celescope snp mkref \\ + --genome_name Homo_sapiens_ensembl_99 \\ --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa ``` """ diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 4df0d7e8..75b3ea7e 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -20,8 +20,6 @@ class Multi(): self.__STEPS__ = init_module.__STEPS__ self.__CONDA__ = os.path.basename(os.environ['CONDA_DEFAULT_ENV']) self.__APP__ = 'celescope' - self.col4_default = None - self.last_step = '' self.steps_not_run = ['mkref'] # remove @@ -29,31 +27,26 @@ class Multi(): if step in self.__STEPS__: self.__STEPS__.remove(step) - # parse_args + # add args + self.parser = None self.common_args() self.step_args() - self.args = self.parser.parse_args() - if self.args.gzip: - self.fq_suffix = ".gz" - else: - self.fq_suffix = "" - if self.args.steps_run == 'all': - self.steps_run = self.__STEPS__ - elif self.args.steps_run: - self.steps_run = self.args.steps_run.strip().split(',') - # init + # set + self.args = None + self.col4_default = None + self.last_step = '' + self.fq_suffix = "" + self.steps_run = self.__STEPS__ self.fq_dict = {} self.col4_dict = {} self.col5_dict = {} - self.logdir = self.args.outdir + '/log' + self.logdir = None - # script init - self.sjm_cmd = f'log_dir {self.logdir}\n' + 
self.sjm_cmd = '' self.sjm_order = '' self.shell_dict = defaultdict(str) - # outdir dict self.outdir_dic = {} @@ -90,7 +83,7 @@ class Multi(): @staticmethod @utils.add_log - def parse_map_col4(mapfile, default_val): + def parse_mapfile(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = {} col5_dict = {} @@ -137,8 +130,18 @@ class Multi(): """ parse_mapfile, make log dir, init script variables, init outdir_dic """ + self.args = self.parser.parse_args() + + if self.args.gzip: + self.fq_suffix = ".gz" + if self.args.steps_run != 'all': + self.steps_run = self.args.steps_run.strip().split(',') + + self.logdir = self.args.outdir + '/log' + self.sjm_cmd = f'log_dir {self.logdir}\n' + # parse_mapfile - self.fq_dict, self.col4_dict, self.col5_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) + self.fq_dict, self.col4_dict, self.col5_dict = self.parse_mapfile(self.args.mapfile, self.col4_default) # mk log dir if self.args.mod == 'sjm': diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 36e217e5..65c0f4ad 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -419,80 +419,6 @@ def get_fq(library_id, library_path): return fq1, fq2 -@add_log -def parse_map_col4(mapfile, default_val): - fq_dict = defaultdict(list) - col4_dict = defaultdict(list) - col5_dict = defaultdict(list) - with open(mapfile) as fh: - for line in fh: - line = line.strip() - if not line: - continue - if line.startswith('#'): - continue - tmp = line.split() - library_id = tmp[0] - library_path = tmp[1] - sample_name = tmp[2] - if len(tmp) >= 4: - col4 = tmp[3] - else: - col4 = default_val - fq1, fq2 = get_fq(library_id, library_path) - - if sample_name in fq_dict: - fq_dict[sample_name][0].append(fq1) - fq_dict[sample_name][1].append(fq2) - else: - fq_dict[sample_name] = [[fq1], [fq2]] - if col4 and col4 != default_val: - col4_dict[sample_name] = col4 - if len(tmp) == 5: - col5_dict[sample_name] = tmp[4] - - for sample_name in fq_dict: - fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) - fq_dict[sample_name][1] = ",".join(fq_dict[sample_name][1]) - - if not fq_dict: - raise Exception('empty mapfile!') - return fq_dict, col4_dict, col5_dict - - -def generate_sjm(cmd, name, conda, m=1, x=1): - res_cmd = f''' -job_begin - name {name} - sched_options -w n -cwd -V -l vf={m}g,p={x} - cmd source activate {conda}; {cmd} -job_end -''' - - return res_cmd - - -def merge_report( - fq_dict, steps, last_step, sjm_cmd, - sjm_order, logdir, conda, outdir, rm_files): - step = "merge_report" - steps_str = ",".join(steps) - samples = ','.join(fq_dict.keys()) - app = tools_dir + '/merge_table.py' - cmd = ( - f'python {app} --samples {samples} ' - f'--steps {steps_str} --outdir {outdir}' - ) - if rm_files: - cmd += ' --rm_files' - sjm_cmd += generate_sjm(cmd, 'merge_report', conda) - for sample in fq_dict: - sjm_order += f'order {step} after {last_step}_{sample}\n' - with open(logdir + '/sjm.job', 'w') as fh: - fh.write(sjm_cmd + '\n') - fh.write(sjm_order) - - def format_number(number: int) -> str: return format(number, ",") @@ -533,38 +459,6 @@ def genDict(dim=3, valType=int): return defaultdict(lambda: genDict(dim - 1, valType=valType)) -def cluster_tsne_list(tsne_df): - """ - tSNE_1 tSNE_2 cluster Gene_Counts - return data list - """ - sum_df = tsne_df.groupby(["cluster"]).agg("count").iloc[:, 0] - percent_df = sum_df.transform(lambda x: round(x / sum(x) * 100, 2)) - res = [] - for cluster in sorted(tsne_df.cluster.unique()): - sub_df = tsne_df[tsne_df.cluster == cluster] - 
name = "cluster {cluster}({percent}%)".format( - cluster=cluster, percent=percent_df[cluster]) - tSNE_1 = list(sub_df.tSNE_1) - tSNE_2 = list(sub_df.tSNE_2) - res.append({"name": name, "tSNE_1": tSNE_1, "tSNE_2": tSNE_2}) - return res - - -def marker_table(marker_df): - """ - return html code - """ - marker_df = marker_df.loc[:, ["cluster", "gene", - "avg_log2FC", "pct.1", "pct.2", "p_val_adj"]] - marker_gene_table = marker_df.to_html( - escape=False, - index=False, - table_id="marker_gene_table", - justify="center") - return marker_gene_table - - def report_prepare(outdir, **kwargs): json_file = outdir + '/../.data.json' if not os.path.exists(json_file): diff --git a/docs_template/CHANGELOG.md b/docs_template/CHANGELOG.md new file mode 100755 index 00000000..109dd6d7 --- /dev/null +++ b/docs_template/CHANGELOG.md @@ -0,0 +1,178 @@ +# Change Log + +## [unreleased] - 2021-06-09 +### Added + +### Changed + +### Fixed +- `celescope.tools.count` will report an error when there are multiple gtf or refFlat file under `genomeDir`. + +### Removed +- `celescope.tools.utils.glob_genomeDir` + +## [1.3.1] - 2021-06-09 +### Added + +- Add wdl workflow. + +- Add Seurat hashtag method in `celescope tag count_tag`. To get Seurat hashtag output, use `--debug`. However, there was a unsolved problem with this method: https://github.com/satijalab/seurat/issues/2549. + +### Changed + +- `{sample}_UMI_count_filtered1.tsv` in mapping_vdj changed to `{sample}_UMI_count_filtered.tsv` (remove `1` after filtered) + +### Fixed and Removed + +- Remove h5 file generation in R to avoid memory issues. + + +## [1.3.0] - 2021-05-28 + +### Added + +- `mkref` subcommand. See `celescope rna mkref`, `celescope fusion mkref` and `celescope virus mkref` for details. + +### Changed + +- Change the way to handle duplicate gene_name and gene_id in gtf file. + +Previous: + + - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. + - one gene_id with multiple gene_name: newer gene_name will overwrite older gene_name. + - duplicated (gene_name, gene_id): "_{count}" will be added to gene_name. + +Now: + + - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. + - one gene_id with multiple gene_name: error. + - duplicated (gene_name, gene_id): ignore duplicated records and print a warning. + +### Fixed + +- Fix `count tag` metrics order in merge.xls + +### Removed + +- Remove `--fusion_pos` from `celescope.fusion.count_fusion` + + +## [1.2.0] - 2021-05-19 + +### Added + +- Assay `rna` outputs .h5 file in 06.analysis directory. + +### Changed + +- Update Seurat from 2.3.4 to 4.0.1. + +- `--genomeDir` in `celescope.fusion.star_fusion` changed to `--fusion_genomeDir` to avoid misunderstanding. + +- Step `star` sort bam by samtools instead of STAR to avoid potential `not enough memory for BAM sorting` error: https://github.com/alexdobin/STAR/issues/1136 + +### Removed + +- Assay `rna` no longer outputs tab-delimited expression matrix file in 05.count directory. + + +## [1.1.9] - 2021-04-25 + +### Added + +- Add parameter `--coefficient` to `celescope tag count_tag` and `multi_tag` + + Default `0.1`. Minimum signal-to-noise ratio is calulated as `SNR_min = max(median(SNRs) * coefficient, 2)` + +- Add `.metrics.json` + +- Add `scopeV1` chemistry support. + +### Changed + +- Optimize speed and memory usage of step `barcode`(~2X faster) and `celescope.tools.count.downsample`(~15-25X faster, 1/2 memory usage). 
+
+- Change filtering of linker from allowing two mismatches in total to two mismatches per segment; this will slightly increase the valid reads percentage.
+
+- Default output fastq files of `barcode` and `cutadapt` are not gzipped. Use `--gzipped` to get gzipped output.
+
+- Change the display of the Barcode-rank plot in the html report.
+
+### Fixed
+
+- Fix a bug where `celescope.tools.barcode.mismatch` could not output all sequences correctly when n_mismatch>=2.
+
+- Fix an error when Numpy >= 1.2.0.
+
+- VDJ merge.xls can display all the metrics correctly.
+
+### Removed
+
+- Remove fastqc from the `barcode` step.
+
+
+## [1.1.8] - 2021-03-26
+
+### Added
+
+- Add read consensus to the VDJ pipeline.
+
+  A consensus step was added before mapping to merge all the reads of the same
+  (barcode, UMI) into one UMI. For the detailed consensus algorithm, refer to `celescope.tools.consensus`.
+  multi_vdj adds the parameter `--not_consensus`, which skips the consensus step and gives the same results as v1.1.7.
+
+- Add parameter `--species` to `celescope vdj mapping_vdj` and `multi_vdj`.
+
+  `--species` can be one of:
+  - `hs`: human
+  - `mmu`: mouse
+
+- Add parameter `--cell_calling_method` to `celescope rna count` and `multi_rna`.
+
+  `--cell_calling_method` can be one of:
+  - `auto`: Same result as v1.1.7.
+  - `cellranger3`: Refer to the cell calling algorithm of cellranger3; the result is similar to cellranger3.
+  - `inflection`: Use the inflection point of the barcode-rank curve as the UMI threshold. The minimum UMI value is changed from initial threshold / 10 to initial threshold / 2 to prevent the use of a lower inflection point when there are multiple inflection points.
+
+- Add 4 tags to the featureCounts bam.
+
+  - `CB`: cell barcode
+  - `UB`: UMI
+  - `GN`: gene name
+  - `GX`: gene id
+
+- Add `--STAR_param` to `celescope rna STAR`.
+
+  Additional parameters of STAR can be passed into the `STAR` step.
+
+### Changed
+
+- One sample can have fastq files of different chemistry in the mapfile. Version <= v1.1.7 reports this as an error.
+
+- The gtf file can be gzipped.
+
+- `multi_rna` can use 3 parameters: `--STAR_index`, `--gtf` and `--refFlat` instead of `--genomeDir`.
+
+- Step `snpCalling` uses mutract.
+
+
+## [1.1.7] - 2020-12-16
+
+### Added
+
+- Automatically detect Singleron chemistry version.
+
+### Changed
+
+- FeatureCounts uses strand specificity.
+
+- Cutadapt default `overlap` changed from `5` to `10`.
+
+- VDJ sorts `NA` last.
+
+- `match clonetypes` are sorted by barcode_count (Frequency) first, then clonetype_ID.
+
+
+
+
diff --git a/docs_template/CONTRIBUTING.md b/docs_template/CONTRIBUTING.md
new file mode 100644
index 00000000..fb9ee986
--- /dev/null
+++ b/docs_template/CONTRIBUTING.md
@@ -0,0 +1,106 @@
+## Pull Requests
+Create pull requests to the `dev` branch.
+
+## Lint code
+Before opening a pull request, lint your code with the following command:
+```
+pip install pylint
+# lint
+# W1618 (no-absolute-import)
+# E1101 (no-member)
+# W1633 (round-builtin)
+# W1619 (old-division)
+# W0105 (String statement has no effect)
+# W0511 TODO!
+# E1130 bad operand type for unary ~: _isnan (invalid-unary-operand-type)
+# W0212 Access to a protected member _option_string_actions of a client class (protected-access)
+pylint --disable=all --enable=E,W --disable=W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212 --jobs=8 celescope
+```
+Your code should be rated at 10 (i.e. no errors or warnings).
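
The lint gate above can also be invoked from Python, e.g. in a CI script. A minimal sketch using pylint's programmatic entry point, with the flags copied from the command above (note that `Run` exits the interpreter with pylint's status code by default, which suits CI jobs):

```python
# Sketch only: run the project's lint configuration from Python.
from pylint.lint import Run

DISABLED = "W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212"

Run([
    "--disable=all",
    "--enable=E,W",
    f"--disable={DISABLED}",
    "--jobs=8",
    "celescope",
])
```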
+ +## Write a new step +When you add a new step, you need to + - Create a step class which inherit from `celescope.tools.step.Step`. + - Create a function with the same name of the module. The main function `celescope` uses this function to run each step. + - Create a parser function with the name `get_opts_{module_name}`. `celescope` command line interface uses this function. The `sub_program` argument in this function hides all the arguments that you do not want to show in the `multi_{assay}` interface. + +For example, in `celescope.tools.cutadapt`: + +``` +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils + + +class Cutadapt(Step): + """ + Features + - Trim adapters in R2 reads with cutadapt. Default adapters includes: + - polyT=A{18}, 18 A bases. + - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. + + Output + - `cutadapt.log` Cutadapt output log file. + - `{sample}_clean_2.fq.gz` R2 reads file without adapters. + """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + {some init code} + + @utils.add_log + def run(self): + {some code to run} + + +@utils.add_log +def cutadapt(args): + step_name = "cutadapt" + cutadapt_obj = Cutadapt(args, step_name) + cutadapt_obj.run() + + +def get_opts_cutadapt(parser, sub_program): + parser.add_argument('--adapter_fasta', help='Addtional adapter fasta file.') + parser.add_argument( + '--minimum_length', + help='Default `20`. Discard processed reads that are shorter than LENGTH.', + default=20 + ) + {other arguments} + if sub_program: + parser.add_argument('--fq', help='Required. R2 reads from step Barcode.', required=True) + parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true') + parser = s_common(parser) + return parser +``` + +## Docs +There is a python script at the root of this repo `generate_docs.py` to generate documents for each released step. The generated docs are in the `docs` folder. It will collect: +- Docstring of the step class. The Docstring should have sections named `Features` and `Output`. +- Help infomation in `get_opts_{module_name}` + +Released assays will be added to `manual.md`. + +## Tests +If you add new steps, you need to create a small data for integration tests. There is a test example in `celescope/tests/test_multi.py`. To run this example: + + +1. Get test data +``` +# If you have access to Singleron Nanjing HPC +copy -r /SGRNJ03/randd/user/zhouyiqi/multi_tests/test_folder {test_dir} +# Or clone from repo +git clone https://github.com/singleron-RD/celescope_tests.git +``` + +2. Run `pytest` +``` +Install pytest +>>> pip install pytest +Run all +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} +Run some tests +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag +``` + +Then you need to create your own test based on this example. \ No newline at end of file diff --git a/docs_template/manual_template.md b/docs_template/manual_template.md new file mode 100644 index 00000000..c524de94 --- /dev/null +++ b/docs_template/manual_template.md @@ -0,0 +1,23 @@ +## Introduction +CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. 
+
+Each pipeline consists of several steps, and all pipelines share two identical pre-processing steps: `barcode` and `cutadapt`. The `barcode` step performs barcode demultiplexing, correction and read filtering. The `cutadapt` step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming.
+
+Currently, CeleScope includes the following pipelines:
+
+- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment (optional).
+
+- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetype counting.
+
+- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplet identification.
+
+
+## [Quick start](quick_start.md)
+
+## [Change log](CHANGELOG.md)
+
+## Pre-processing
+
+- [barcode](tools/barcode.md)
+- [cutadapt](tools/cutadapt.md)
+
diff --git a/docs_template/quick_start.md b/docs_template/quick_start.md
new file mode 100755
index 00000000..57d2327f
--- /dev/null
+++ b/docs_template/quick_start.md
@@ -0,0 +1,110 @@
+# Quick start
+
+CeleScope contains interfaces `multi_{assay}` to generate pipeline scripts for all assays. Assays can be one of:
+
+- rna
+- vdj
+- tag
+
+Run `multi_{assay} -h` for help.
+
+
+## Usage Example
+
+- Single-cell rna
+
+    ```
+    conda activate celescope
+    multi_rna\
+    --mapfile ./rna.mapfile\
+    --genomeDir /SGRNJ/Public/Database/genome/homo_mus\
+    --thread 8\
+    --mod shell
+    ```
+`--mapfile` Required. Mapfile path.
+
+`--genomeDir` Required. Genome directory.
+
+`--thread` The recommended setting is 8, and the maximum should not exceed 20.
+
+`--mod` Create `sjm` (simple job manager, https://github.com/StanfordBioinformatics/SJM) or `shell` scripts.
+
+The scripts above will generate a `shell` directory containing `{sample}.sh` files.
+
+You can start your analysis by running:
+```
+sh ./shell/{sample}.sh
+```
+
+- Single cell vdj
+
+```
+conda activate celescope
+multi_vdj \
+    --mapfile ./vdj.mapfile \
+    --type TCR \
+    --thread 8 \
+    --mod shell
+```
+
+`--type` Required. TCR or BCR.
+
+- Single cell tag
+
+```
+conda activate celescope
+multi_tag \
+    --mapfile ./tag.mapfile\
+    --barcode_fasta ./smk_barcode.fa\
+    --fq_pattern L25C45\
+    --mod shell
+```
+
+`--barcode_fasta` Required. Tag barcode fasta file.
+```
+>tag_0
+GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC
+>tag_1
+TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG
+>tag_2
+AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA
+>tag_3
+CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG
+```
+
+`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases.
+
+`L` linker (common sequences)
+`C` tag barcode
+
+## How to write mapfile
+
+The mapfile is a tab-delimited text file with at least three columns. Each line of the mapfile describes one pair of paired-end fastq files.
+
+1st column: Fastq file prefix.
+2nd column: Fastq file directory path.
+3rd column: Sample name, which is the prefix of all output files.
+4th column: The 4th column has a different meaning for each assay. The single-cell rna directory after running CeleScope is called `matched_dir`.
+- `rna` Optional, forced cell number.
+- `vdj` Optional, matched_dir.
+- `tag` Required, matched_dir.
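
As a sketch of the layout just described, the mapfile can be read with a few lines of Python (the real parser, `Multi.parse_mapfile` in `celescope/tools/multi.py`, handles more cases, such as merging multiple fastq files per sample):

```python
import csv
from collections import defaultdict

def read_mapfile(path):
    """Minimal mapfile reader, for illustration only."""
    fq = defaultdict(list)   # sample -> [(fastq_prefix, fastq_dir), ...]
    col4 = {}                # sample -> optional 4th column
    with open(path) as fh:
        for row in csv.reader(fh, delimiter="\t"):
            # Skip blank lines, comments, and malformed rows.
            if not row or row[0].startswith("#") or len(row) < 3:
                continue
            prefix, fastq_dir, sample = row[0], row[1], row[2]
            fq[sample].append((prefix, fastq_dir))
            if len(row) > 3:
                col4[sample] = row[3]
    return fq, col4
```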
+ +### Example + +Sample1 has 2 paired-end fastq files located in 2 different directories(fastq_dir1 and fastq_dir2). Sample2 has 1 paired-end fastq file located in fastq_dir1. +``` +$cat ./my.mapfile +fastq_prefix1 fastq_dir1 sample1 +fastq_prefix2 fastq_dir2 sample1 +fastq_prefix3 fastq_dir1 sample2 + +$ls fastq_dir1 +fastq_prefix1_1.fq.gz fastq_prefix1_2.fq.gz +fastq_prefix3_1.fq.gz fastq_prefix3_2.fq.gz + +$ls fastq_dir2 +fastq_prefix2_1.fq.gz fastq_prefix2_2.fq.gz +``` + + + diff --git a/generate_docs.py b/generate_docs.py index a43105a0..e4788abc 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -1,6 +1,7 @@ import argparse import inspect import os +import importlib from collections import defaultdict import celescope.tools.utils as utils @@ -8,36 +9,13 @@ from celescope.celescope import ArgFormatter from celescope.__init__ import ASSAY_DICT, RELEASED_ASSAYS PRE_PROCESSING_STEPS = ('sample', 'barcode', 'cutadapt') -DOCS_ROOT = 'docs' -MANUAL_MD = f'{DOCS_ROOT}/manual.md' -MANUAL_TEMPLATE = f'{DOCS_ROOT}/manual_template.md' +DOCS_DIR = 'docs/' +TEMPLATE_DIR = 'docs_template/' +MANUAL_MD = f'{DOCS_DIR}/manual.md' +MANUAL_TEMPLATE = f'{DOCS_DIR}/manual_template.md' -def generate_single_step_doc(assay, step): - """ - Returns: - - md file relative to DOCS_ROOT - """ - step_module, folder = utils.find_step_module_with_folder(assay, step) - func_opts = getattr(step_module, f"get_opts_{step}") - - class_docs = get_class_docs(step_module) - argument_docs = get_argument_docs(func_opts) - - folder_path = f'{DOCS_ROOT}/{folder}/' - if not os.path.exists(folder_path): - os.system(f'mkdir -p {folder_path}') - - out_md = f'{DOCS_ROOT}/{folder}/{step}.md' - with open(out_md, 'w') as out_file: - out_file.write(class_docs) - out_file.write(argument_docs) - return f'{folder}/{step}.md' - -def get_argument_docs(func_opts): - argument_docs = "" - parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) - func_opts(parser, sub_program=True) +def get_argument_docs_from_parser(parser): for argument in parser._option_string_actions: if not argument in ['-h', '--help']: help_msg = parser._option_string_actions[argument].help @@ -69,6 +47,46 @@ def get_class_docs(step_module): return class_docs +class Docs(): + def __init__(self, assay): + self.assay = assay + + init_module = utils.find_assay_init(assay) + self.steps = init_module.__STEPS__ + self.steps.append(f'multi_{assay}') + folder = f'{DOCS_DIR}/{assay}/' + + self.out_md_dict = {} + self.relative_md_path = {} + for step in self.steps: + self.out_md_dict[step] = f'{folder}/{step}.md' + self.relative_md_path[step] = f'{assay}/{step}.md' + + if not os.path.exists(folder): + os.system(f'mkdir -p {folder}') + + def get_argument_docs(self, step, step_module): + if step.startswith("multi"): + multi_class = getattr(step_module, f'Multi_{self.assay}') + multi_obj = multi_class(self.assay) + argument_docs = get_argument_docs_from_parser(multi_obj.parser) + else: + parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) + func_opts = getattr(step_module, f"get_opts_{step}") + func_opts(parser, sub_program=True) + argument_docs = get_argument_docs_from_parser(parser) + return argument_docs + + + def write_step_doc(self, step): + step_module = utils.find_step_module(self.assay, step) + class_docs = get_class_docs(step_module) + argument_docs = self.get_argument_docs(step, step_module) + + with open(self.out_md_dict[step], 'w') as out_file: + out_file.write(class_docs) + out_file.write(argument_docs) 
+ def write_step_in_manual(md_path, step, manual_handle): """ - [mkref](rna/mkref.md) @@ -77,19 +95,24 @@ def write_step_in_manual(md_path, step, manual_handle): manual_handle.write(f'- [{step}]({md_path})\n') - +""" @utils.add_log def generate_all_docs(): md_path_dict = defaultdict(dict) + for assay in ASSAY_DICT: init_module = utils.find_assay_init(assay) - __STEPS__ = init_module.__STEPS__ + steps = init_module.__STEPS__ generate_all_docs.logger.info(f"Writing docs {assay} ") - for step in __STEPS__: + + steps.append(f'multi_{assay}') + for step in steps: generate_all_docs.logger.info(f"Writing doc {assay}.{step}") md_path = generate_single_step_doc(assay, step) md_path_dict[assay][step] = md_path return md_path_dict +""" + @utils.add_log def write_manual(md_path_dict): @@ -108,5 +131,6 @@ def write_manual(md_path_dict): if __name__ == "__main__": - md_path_dict = generate_all_docs() - write_manual(md_path_dict) \ No newline at end of file + cmd = f"cp -r {TEMPLATE_DIR} {DOCS_DIR}" + os.system(cmd) + \ No newline at end of file diff --git a/methods/rna.txt b/methods/rna.txt new file mode 100755 index 00000000..168a02a4 --- /dev/null +++ b/methods/rna.txt @@ -0,0 +1,10 @@ +Single-cell transcriptomics and analysis + +Raw reads were processed to generate gene expression profiles using CeleScope v1.2.0(Singleron Biotechnologies) +with default parameters. Briefly, Barcodes and UMIs were extracted from R1 reads and corrected. Adapter sequences and poly A tails were trimmed from R2 reads and the trimmed R2 reads were aligned against the {GRCh38 (hg38)} {GRCm38 (mm10)} transcriptome +using STAR(v2.6.1b). Uniquely mapped reads were then assigned to exons with FeatureCounts(v2.0.1). Successfully Assigned Reads with the same cell barcode, UMI and gene were grouped together to generate the gene expression matrix for further analysis. + +Single-cell analyses were performed using the Seurat package(v4.0.1). All the variable genes selected by the FindVariableFeatures function were used to compute the PCs. The first 20 PCs and a resolution of 0.6 were used for clustering and tSNE visualization. 
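
For illustration only, the Seurat workflow described above maps onto an equivalent outline in Python with scanpy. This is not the pipeline's code, and the input path is a placeholder:

```python
import scanpy as sc

# Placeholder path to a 10X-style matrix directory.
adata = sc.read_10x_mtx("path/to/matrix_10X")

sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)           # variable gene selection
adata = adata[:, adata.var.highly_variable]
sc.tl.pca(adata, n_comps=20)                 # first 20 PCs
sc.pp.neighbors(adata, n_pcs=20)
sc.tl.leiden(adata, resolution=0.6)          # clustering at resolution 0.6 (needs leidenalg)
sc.tl.tsne(adata, n_pcs=20)                  # tSNE visualization
```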
+ + + -- Gitee From e28a9c82836225c917897a8bc35fada955301682 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:10:52 +0800 Subject: [PATCH 87/96] docs --- celescope/citeseq/multi_citeseq.py | 6 +- celescope/hla/multi_hla.py | 4 + celescope/snp/multi_snp.py | 21 +++ docs/CHANGELOG.md | 178 +++++++++++++++++++ docs/CONTRIBUTING.md | 106 +++++++++++ docs/capture_rna/count_capture_rna.md | 61 +++++++ docs/capture_rna/multi_capture_rna.md | 106 +++++++++++ docs/capture_virus/analysis_capture_virus.md | 19 ++ docs/capture_virus/count_capture_virus.md | 19 ++ docs/capture_virus/mkref.md | 15 ++ docs/capture_virus/multi_capture_virus.md | 94 ++++++++++ docs/citeseq/analysis_cite.md | 13 ++ docs/citeseq/count_cite.md | 13 ++ docs/citeseq/multi_citeseq.md | 95 ++++++++++ docs/dynaseq/conversion.md | 26 +++ docs/dynaseq/multi_dynaseq.md | 110 ++++++++++++ docs/dynaseq/replace_tsne.md | 30 ++++ docs/dynaseq/replacement.md | 36 ++++ docs/dynaseq/subsitution.md | 20 +++ docs/fusion/count_fusion.md | 23 +++ docs/fusion/mkref.md | 24 +++ docs/fusion/multi_fusion.md | 90 ++++++++++ docs/fusion/star_fusion.md | 32 ++++ docs/hla/mapping_hla.md | 15 ++ docs/hla/multi_hla.md | 73 ++++++++ docs/manual.md | 41 +++++ docs/manual_template.md | 23 +++ docs/mut/count_mut.md | 17 ++ docs/mut/mapping_mut.md | 17 ++ docs/mut/multi_mut.md | 81 +++++++++ docs/quick_start.md | 110 ++++++++++++ docs/rna/analysis.md | 51 ++++++ docs/rna/mkref.md | 38 ++++ docs/rna/multi_rna.md | 106 +++++++++++ docs/rna/star.md | 56 ++++++ docs/rna_virus/analysis_rna_virus.md | 17 ++ docs/rna_virus/count_virus.md | 17 ++ docs/rna_virus/multi_rna_virus.md | 92 ++++++++++ docs/rna_virus/star_virus.md | 32 ++++ docs/snp/analysis_snp.md | 23 +++ docs/snp/mkref.md | 29 +++ docs/snp/multi_snp.md | 97 ++++++++++ docs/snp/variant_calling.md | 38 ++++ docs/tag/analysis_tag.md | 19 ++ docs/tag/count_tag.md | 44 +++++ docs/tag/mapping_tag.md | 48 +++++ docs/tag/multi_tag.md | 108 +++++++++++ docs/tag/split_tag.md | 26 +++ docs/tcr_fl/assemble.md | 15 ++ docs/tcr_fl/multi_tcr_fl.md | 79 ++++++++ docs/tcr_fl/split_fq.md | 15 ++ docs/tools/barcode.md | 61 +++++++ docs/tools/consensus.md | 24 +++ docs/tools/count.md | 61 +++++++ docs/tools/cutadapt.md | 44 +++++ docs/tools/featureCounts.md | 38 ++++ docs/tools/sample.md | 17 ++ docs/tools/target_metrics.md | 28 +++ docs/vdj/count_vdj.md | 37 ++++ docs/vdj/mapping_vdj.md | 35 ++++ docs/vdj/multi_vdj.md | 84 +++++++++ generate_docs.py | 71 ++++++-- 62 files changed, 2949 insertions(+), 19 deletions(-) create mode 100755 docs/CHANGELOG.md create mode 100644 docs/CONTRIBUTING.md create mode 100644 docs/capture_rna/count_capture_rna.md create mode 100644 docs/capture_rna/multi_capture_rna.md create mode 100644 docs/capture_virus/analysis_capture_virus.md create mode 100644 docs/capture_virus/count_capture_virus.md create mode 100644 docs/capture_virus/mkref.md create mode 100644 docs/capture_virus/multi_capture_virus.md create mode 100644 docs/citeseq/analysis_cite.md create mode 100644 docs/citeseq/count_cite.md create mode 100644 docs/citeseq/multi_citeseq.md create mode 100644 docs/dynaseq/conversion.md create mode 100644 docs/dynaseq/multi_dynaseq.md create mode 100644 docs/dynaseq/replace_tsne.md create mode 100644 docs/dynaseq/replacement.md create mode 100644 docs/dynaseq/subsitution.md create mode 100644 docs/fusion/count_fusion.md create mode 100644 docs/fusion/mkref.md create mode 100644 docs/fusion/multi_fusion.md create mode 100644 docs/fusion/star_fusion.md create mode 100644 
docs/hla/mapping_hla.md
 create mode 100644 docs/hla/multi_hla.md
 create mode 100644 docs/manual.md
 create mode 100644 docs/manual_template.md
 create mode 100644 docs/mut/count_mut.md
 create mode 100644 docs/mut/mapping_mut.md
 create mode 100644 docs/mut/multi_mut.md
 create mode 100755 docs/quick_start.md
 create mode 100644 docs/rna/analysis.md
 create mode 100644 docs/rna/mkref.md
 create mode 100644 docs/rna/multi_rna.md
 create mode 100644 docs/rna/star.md
 create mode 100644 docs/rna_virus/analysis_rna_virus.md
 create mode 100644 docs/rna_virus/count_virus.md
 create mode 100644 docs/rna_virus/multi_rna_virus.md
 create mode 100644 docs/rna_virus/star_virus.md
 create mode 100644 docs/snp/analysis_snp.md
 create mode 100644 docs/snp/mkref.md
 create mode 100644 docs/snp/multi_snp.md
 create mode 100644 docs/snp/variant_calling.md
 create mode 100644 docs/tag/analysis_tag.md
 create mode 100644 docs/tag/count_tag.md
 create mode 100644 docs/tag/mapping_tag.md
 create mode 100644 docs/tag/multi_tag.md
 create mode 100644 docs/tag/split_tag.md
 create mode 100644 docs/tcr_fl/assemble.md
 create mode 100644 docs/tcr_fl/multi_tcr_fl.md
 create mode 100644 docs/tcr_fl/split_fq.md
 create mode 100644 docs/tools/barcode.md
 create mode 100644 docs/tools/consensus.md
 create mode 100644 docs/tools/count.md
 create mode 100644 docs/tools/cutadapt.md
 create mode 100644 docs/tools/featureCounts.md
 create mode 100644 docs/tools/sample.md
 create mode 100644 docs/tools/target_metrics.md
 create mode 100644 docs/vdj/count_vdj.md
 create mode 100644 docs/vdj/mapping_vdj.md
 create mode 100644 docs/vdj/multi_vdj.md
diff --git a/celescope/citeseq/multi_citeseq.py b/celescope/citeseq/multi_citeseq.py
index 0ec65bc9..e9b10e67 100755
--- a/celescope/citeseq/multi_citeseq.py
+++ b/celescope/citeseq/multi_citeseq.py
@@ -1,6 +1,10 @@
+from celescope.tools.multi import Multi
 
-def main():
+class Multi_citeseq(Multi):
+    pass
+
+def main():
     # TODO
     pass
 
diff --git a/celescope/hla/multi_hla.py b/celescope/hla/multi_hla.py
index 21e44651..802fc981 100755
--- a/celescope/hla/multi_hla.py
+++ b/celescope/hla/multi_hla.py
@@ -1,3 +1,7 @@
+from celescope.tools.multi import Multi
+
+class Multi_hla(Multi):
+    pass
 
 def main():
     # TODO
diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py
index 69dbc418..b0d3eba1 100755
--- a/celescope/snp/multi_snp.py
+++ b/celescope/snp/multi_snp.py
@@ -3,6 +3,27 @@ from celescope.tools.multi import Multi
 
 
 class Multi_snp(Multi):
+    """
+    Usage
+    ```
+    multi_snp\
+        --mapfile ./test1.mapfile\
+        --genomeDir {genomeDir after running celescope snp mkref}\
+        --thread 10\
+        --mod shell\
+        --gene_list gene_list.tsv\
+        --annovar_config annovar.config
+    ```
+    Example annovar_config file:
+    ```
+    [ANNOVAR]
+    dir = /Public/Software/annovar/
+    db = /SGRNJ/Database/script/database/annovar/humandb
+    buildver = hg38
+    protocol = refGene,cosmic70
+    operation = g,f
+    ```
+    """
 
     def star(self, sample):
         step = 'star'
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
new file mode 100755
index 00000000..109dd6d7
--- /dev/null
+++ b/docs/CHANGELOG.md
@@ -0,0 +1,178 @@
+# Change Log
+
+## [unreleased] - 2021-06-09
+### Added
+
+### Changed
+
+### Fixed
+- `celescope.tools.count` will report an error when there are multiple gtf or refFlat files under `genomeDir`.
+
+### Removed
+- `celescope.tools.utils.glob_genomeDir`
+
+## [1.3.1] - 2021-06-09
+### Added
+
+- Add wdl workflow.
+
+- Add Seurat hashtag method in `celescope tag count_tag`. To get Seurat hashtag output, use `--debug`.
However, there was an unsolved problem with this method: https://github.com/satijalab/seurat/issues/2549.
+
+### Changed
+
+- `{sample}_UMI_count_filtered1.tsv` in mapping_vdj changed to `{sample}_UMI_count_filtered.tsv` (remove `1` after filtered)
+
+### Fixed and Removed
+
+- Remove h5 file generation in R to avoid memory issues.
+
+
+## [1.3.0] - 2021-05-28
+
+### Added
+
+- `mkref` subcommand. See `celescope rna mkref`, `celescope fusion mkref` and `celescope virus mkref` for details.
+
+### Changed
+
+- Change the way to handle duplicate gene_name and gene_id in gtf file.
+
+Previous:
+
+  - one gene_name with multiple gene_id: "_{count}" will be added to gene_name.
+  - one gene_id with multiple gene_name: newer gene_name will overwrite older gene_name.
+  - duplicated (gene_name, gene_id): "_{count}" will be added to gene_name.
+
+Now:
+
+  - one gene_name with multiple gene_id: "_{count}" will be added to gene_name.
+  - one gene_id with multiple gene_name: error.
+  - duplicated (gene_name, gene_id): ignore duplicated records and print a warning.
+
+### Fixed
+
+- Fix `count tag` metrics order in merge.xls
+
+### Removed
+
+- Remove `--fusion_pos` from `celescope.fusion.count_fusion`
+
+
+## [1.2.0] - 2021-05-19
+
+### Added
+
+- Assay `rna` outputs .h5 file in 06.analysis directory.
+
+### Changed
+
+- Update Seurat from 2.3.4 to 4.0.1.
+
+- `--genomeDir` in `celescope.fusion.star_fusion` changed to `--fusion_genomeDir` to avoid misunderstanding.
+
+- Step `star` sorts bam with samtools instead of STAR to avoid a potential `not enough memory for BAM sorting` error: https://github.com/alexdobin/STAR/issues/1136
+
+### Removed
+
+- Assay `rna` no longer outputs the tab-delimited expression matrix file in the 05.count directory.
+
+
+## [1.1.9] - 2021-04-25
+
+### Added
+
+- Add parameter `--coefficient` to `celescope tag count_tag` and `multi_tag`.
+
+  Default `0.1`. The minimum signal-to-noise ratio is calculated as `SNR_min = max(median(SNRs) * coefficient, 2)`.
+
+- Add `.metrics.json`.
+
+- Add `scopeV1` chemistry support.
+
+### Changed
+
+- Optimize speed and memory usage of step `barcode` (~2X faster) and `celescope.tools.count.downsample` (~15-25X faster, 1/2 memory usage).
+
+- Change filtering of linker from allowing two mismatches in total to two mismatches per segment; this will slightly increase the valid reads percentage.
+
+- Default output fastq files of `barcode` and `cutadapt` are not gzipped. Use `--gzipped` to get gzipped output.
+
+- Change the display of the barcode-rank plot in the html report.
+
+### Fixed
+
+- Fix a bug where `celescope.tools.barcode.mismatch` could not output all sequences correctly when n_mismatch>=2.
+
+- Fix an error when Numpy >= 1.2.0.
+
+- VDJ merge.xls can display all the metrics correctly.
+
+### Removed
+
+- Remove fastqc from the `barcode` step.
+
+
+## [1.1.8] - 2021-03-26
+
+### Added
+
+- Add read consensus to the VDJ pipeline.
+
+  A consensus step was added before mapping to merge all the reads of the same
+  (barcode, UMI) into one UMI. For the detailed consensus algorithm, refer to `celescope.tools.consensus`.
+  multi_vdj adds the parameter `--not_consensus` so that you can skip the consensus step and get the same results as v1.1.7.
+
+- Add parameter `--species` to `celescope vdj mapping_vdj` and `multi_vdj`.
+
+  `--species` can be one of:
+  - `hs`: human
+  - `mmu`: mouse
+
+- Add parameter `--cell_calling_method` to `celescope rna count` and `multi_rna`.
+
+  `--cell_calling_method` can be one of:
+  - `auto`: Same result as v1.1.7.
+  - `cellranger3`: Refer to the cell calling algorithm of cellranger3; the result is similar to cellranger3.
+  - `inflection`: Use the inflection point of the barcode-rank curve as the UMI threshold. The minimum UMI value is changed from initial threshold / 10 to initial threshold / 2 to prevent the use of a lower inflection point when there are multiple inflection points.
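+
+  For intuition, the inflection idea can be sketched in a few lines of Python (a toy illustration on the log-log barcode-rank curve, not the CeleScope implementation):
+
+  ```
+  import numpy as np
+
+  def inflection_threshold(umi_counts):
+      """Toy sketch: UMI value at the steepest drop of the barcode-rank curve."""
+      counts = np.sort(np.asarray(umi_counts))[::-1]      # rank-sorted, descending
+      counts = counts[counts > 0]
+      log_rank = np.log10(np.arange(1, counts.size + 1))
+      log_umi = np.log10(counts)
+      slopes = np.diff(log_umi) / np.diff(log_rank)       # d(log UMI) / d(log rank)
+      return counts[np.argmin(slopes) + 1]                # most negative slope = inflection
+  ```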
+
+- Add 4 tags to featureCounts bam:
+
+  - `CB`: cell barcode
+  - `UB`: UMI
+  - `GN`: gene name
+  - `GX`: gene id
+
+- Add `--STAR_param` to `celescope rna STAR`.
+
+  Additional parameters of STAR can be passed into the `STAR` step.
+
+### Changed
+
+- One sample can have fastq files of different chemistry in the mapfile. Version <= v1.1.7 reports this as an error.
+
+- Gtf file can be gzipped.
+
+- `multi_rna` can use 3 parameters: `--STAR_index`, `--gtf` and `--refFlat` instead of `--genomeDir`.
+
+- Step `snpCalling` uses mutract.
+
+
+## [1.1.7] - 2020-12-16
+
+### Added
+
+- Automatically detect Singleron chemistry version.
+
+### Changed
+
+- FeatureCounts uses strand specificity.
+
+- Cutadapt default `overlap` changed from `5` to `10`.
+
+- VDJ sorts `NA` last.
+
+- `match clonetypes` are sorted by barcode_count (Frequency) first, then clonetype_ID.
+
+
+
+
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
new file mode 100644
index 00000000..fb9ee986
--- /dev/null
+++ b/docs/CONTRIBUTING.md
@@ -0,0 +1,106 @@
+## Pull Requests
+Create pull requests against the `dev` branch.
+
+## Lint code
+Before opening a pull request, you should lint your code with the following command:
+```
+pip install pylint
+# lint
+# W1618 (no-absolute-import)
+# E1101 (no-member)
+# W1633 (round-builtin)
+# W1619 (old-division)
+# W0105 (String statement has no effect)
+# W0511 TODO!
+# E1130 bad operand type for unary ~: _isnan (invalid-unary-operand-type)
+# W0212 Access to a protected member _option_string_actions of a client class (protected-access)
+pylint --disable=all --enable=E,W --disable=W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212 --jobs=8 celescope
+```
+Your code should be rated at 10 (i.e. no errors or warnings).
+
+## Write a new step
+When you add a new step, you need to
+ - Create a step class that inherits from `celescope.tools.step.Step`.
+ - Create a function with the same name as the module. The main `celescope` entry point uses this function to run each step.
+ - Create a parser function named `get_opts_{module_name}`. The `celescope` command line interface uses this function. The `sub_program` argument in this function hides all the arguments that you do not want to show in the `multi_{assay}` interface.
+
+For example, in `celescope.tools.cutadapt`:
+
+```
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+
+class Cutadapt(Step):
+    """
+    Features
+    - Trim adapters in R2 reads with cutadapt. Default adapters include:
+        - polyT=A{18}, 18 A bases.
+        - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter.
+
+    Output
+    - `cutadapt.log` Cutadapt output log file.
+    - `{sample}_clean_2.fq.gz` R2 reads file without adapters.
+    """
+
+    def __init__(self, args, step_name):
+        Step.__init__(self, args, step_name)
+        {some init code}
+
+    @utils.add_log
+    def run(self):
+        {some code to run}
+
+
+@utils.add_log
+def cutadapt(args):
+    step_name = "cutadapt"
+    cutadapt_obj = Cutadapt(args, step_name)
+    cutadapt_obj.run()
+
+
+def get_opts_cutadapt(parser, sub_program):
+    parser.add_argument('--adapter_fasta', help='Additional adapter fasta file.')
+    parser.add_argument(
+        '--minimum_length',
+        help='Default `20`. Discard processed reads that are shorter than LENGTH.',
+        default=20
+    )
+    {other arguments}
+    if sub_program:
+        parser.add_argument('--fq', help='Required. R2 reads from step Barcode.', required=True)
+        parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true')
+        parser = s_common(parser)
+    return parser
+```
+
+## Docs
+`generate_docs.py`, a python script at the root of this repo, generates documents for each released step. The generated docs are in the `docs` folder. It collects:
+- The docstring of the step class. The docstring should have sections named `Features` and `Output`.
+- The help information in `get_opts_{module_name}`.
+
+Released assays will be added to `manual.md`.
+
+## Tests
+If you add new steps, you need to create a small dataset for integration tests. There is a test example in `celescope/tests/test_multi.py`. To run this example:
+
+
+1. Get test data
+```
+# If you have access to Singleron Nanjing HPC
+cp -r /SGRNJ03/randd/user/zhouyiqi/multi_tests/test_folder {test_dir}
+# Or clone from repo
+git clone https://github.com/singleron-RD/celescope_tests.git
+```
+
+2. Run `pytest`
+```
+# Install pytest
+pip install pytest
+# Run all tests
+pytest -s ./tests/test_multi.py --test_dir {test_dir}
+# Run some tests
+pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag
+```
+
+Then you need to create your own test based on this example.
\ No newline at end of file
diff --git a/docs/capture_rna/count_capture_rna.md b/docs/capture_rna/count_capture_rna.md
new file mode 100644
index 00000000..102eb020
--- /dev/null
+++ b/docs/capture_rna/count_capture_rna.md
@@ -0,0 +1,61 @@
+## Features
+- Cell-calling: Distinguish cell barcodes from background barcodes.
+
+- Generate expression matrix.
+
+## Output
+- `{sample}_all_matrix` The expression matrix of all detected barcodes.
+    Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix_10X` The expression matrix of the barcodes identified as cells.
+Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix.tsv.gz` The expression matrix of the barcodes identified as cells, separated by tabs.
+CeleScope >=1.2.0 does not output this file.
+
+- `{sample}_count_detail.txt.gz` 4 columns:
+    - barcode
+    - gene ID
+    - UMI count
+    - read_count
+
+- `{sample}_counts.txt` 6 columns:
+    - Barcode: barcode sequence
+    - readcount: read count of each barcode
+    - UMI2: UMI count (with reads per UMI >= 2) for each barcode
+    - UMI: UMI count for each barcode
+    - geneID: gene count for each barcode
+    - mark: cell barcode or background barcode.
+
+        `CB` cell
+        `UB` background
+
+- `{sample}_downsample.txt` 3 columns:
+    - percent: percentage of sampled reads
+    - median_geneNum: median gene number per cell
+    - saturation: sequencing saturation
+
+- `barcode_filter_magnitude.pdf` Barcode-UMI plot.
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--expected_cell_num` Default `3000`. Expected cell number.
+
+`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
+`--bam` Required. BAM file from featureCounts.
+
+`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%.
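+
+For Python users, a rough equivalent of `Seurat::Read10X` for these matrix folders can be sketched with scipy and pandas. The barcodes/features/matrix file names below follow the usual 10X triplet and are an assumption, not a documented CeleScope contract:
+
+```
+import pandas as pd
+from scipy.io import mmread
+
+def read_matrix_10x(matrix_dir):
+    """Load a 10X-style triplet as (sparse matrix, features, barcodes)."""
+    mtx = mmread(f"{matrix_dir}/matrix.mtx").tocsr()      # features x barcodes
+    features = pd.read_csv(f"{matrix_dir}/features.tsv", sep="\t", header=None)
+    barcodes = pd.read_csv(f"{matrix_dir}/barcodes.tsv", sep="\t", header=None)[0]
+    return mtx, features, barcodes
+```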
diff --git a/docs/capture_rna/multi_capture_rna.md b/docs/capture_rna/multi_capture_rna.md
new file mode 100644
index 00000000..a2c6067a
--- /dev/null
+++ b/docs/capture_rna/multi_capture_rna.md
@@ -0,0 +1,106 @@
+
+
+## Arguments
+`--mod` mod, sjm or shell
+
+`--mapfile` tsv file, 4 columns:
+    1st col: LibName;
+    2nd col: DataDir;
+    3rd col: SampleName;
+    4th col: optional;
+
+`--rm_files` remove redundant fq.gz and bam files after running
+
+`--steps_run` Steps to run. Multiple steps are separated by commas.
+
+`--outdir` Output directory.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
+`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of:
+- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations.
+- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries.
+- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the
+same time.
+
+`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number
+    of bases.
+- `C`: cell barcode
+- `L`: linker (common sequences)
+- `U`: UMI
+- `T`: poly T
+
+`--whitelist` Cell barcode whitelist file path, one cell barcode per line.
+
+`--linker` Linker whitelist file path, one linker per line.
+
+`--lowQual` Default 0. Bases in the cell barcode and UMI whose phred values are lower than lowQual will be regarded as low-quality bases.
+
+`--lowNum` The maximum allowed lowQual bases in the cell barcode and UMI.
+
+`--nopolyT` Outputs R1 reads without polyT.
+
+`--noLinker` Outputs R1 reads without correct linker.
+
+`--allowNoPolyT` Allow valid reads without polyT.
+
+`--allowNoLinker` Allow valid reads without correct linker.
+
+`--gzip` Output gzipped fastq files.
+
+`--adapter_fasta` Additional adapter fasta file.
+
+`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH.
+
+`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq).
+Some Illumina instruments use a two-color chemistry to encode the four bases.
+This includes the NextSeq and the NovaSeq.
+In those instruments, a ‘dark cycle’ (with no detected color) encodes a G.
+However, dark cycles also occur when sequencing “falls off” the end of the fragment.
+The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+
+`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases.
+For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter.
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that
+at least {overlap} bases match between adapter and read.
+
+`--insert` Default `150`. Read2 insert length.
+
+`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases
+is higher than or equal to this value.
+ +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--gtf_type` Specify feature type in GTF annotation + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. + +`--genomeDir` Required. Genome directory. + +`--save_rds` Write rds to disk. + +`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: +``` +cell_type marker +Alveolar "CLDN18,FOLR1,AQP4,PEBP4" +Endothelial "CLDN5,FLT1,CDH5,RAMP2" +Epithelial "CAPS,TMEM190,PIFO,SNTN" +Fibroblast "COL1A1,DCN,COL1A2,C1R" +B_cell "CD79A,IGKC,IGLC3,IGHG3" +Myeloid "LYZ,MARCO,FCGR3A" +T_cell "CD3D,TRBC1,TRBC2,TRAC" +LUAD "NKX2-1,NAPSA,EPCAM" +LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" +``` + diff --git a/docs/capture_virus/analysis_capture_virus.md b/docs/capture_virus/analysis_capture_virus.md new file mode 100644 index 00000000..01aeb0c1 --- /dev/null +++ b/docs/capture_virus/analysis_capture_virus.md @@ -0,0 +1,19 @@ + + +## Arguments +`--umi_threshold` method to find virus UMI threshold + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--match_dir` match_dir + +`--virus_file` virus UMI count file + diff --git a/docs/capture_virus/count_capture_virus.md b/docs/capture_virus/count_capture_virus.md new file mode 100644 index 00000000..2789114d --- /dev/null +++ b/docs/capture_virus/count_capture_virus.md @@ -0,0 +1,19 @@ + + +## Arguments +`--min_query_length` minimum query length + +`--outdir` output dir + +`--assay` assay + +`--sample` sample name + +`--thread` None + +`--debug` debug + +`--match_dir` matched rna_virus directory + +`--virus_bam` None + diff --git a/docs/capture_virus/mkref.md b/docs/capture_virus/mkref.md new file mode 100644 index 00000000..a3627da9 --- /dev/null +++ b/docs/capture_virus/mkref.md @@ -0,0 +1,15 @@ + + +## Arguments +`--genomeDir` Default='./'. Output directory. + +`--thread` Default=6. Threads to use. + +`--genome_name` Required, genome name. + +`--dry_run` Only write config file and exit. + +`--fasta` virus fasta file + +`--genomeSAindexNbases` STAR genomeSAindexNbases + diff --git a/docs/capture_virus/multi_capture_virus.md b/docs/capture_virus/multi_capture_virus.md new file mode 100644 index 00000000..95c3421d --- /dev/null +++ b/docs/capture_virus/multi_capture_virus.md @@ -0,0 +1,94 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. 
You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--threshold` Default 0.5. Valid base threshold. + +`--not_consensus` Skip the consensus step. + +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. 
+ +`--virus_genomeDir` virus genome dir + +`--min_query_length` minimum query length + +`--umi_threshold` method to find virus UMI threshold + diff --git a/docs/citeseq/analysis_cite.md b/docs/citeseq/analysis_cite.md new file mode 100644 index 00000000..df9b21da --- /dev/null +++ b/docs/citeseq/analysis_cite.md @@ -0,0 +1,13 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--match_dir` match_dir + +`--citeseq_mtx` citeseq matrix .gz file + +`--assay` assay + diff --git a/docs/citeseq/count_cite.md b/docs/citeseq/count_cite.md new file mode 100644 index 00000000..55cbeaf4 --- /dev/null +++ b/docs/citeseq/count_cite.md @@ -0,0 +1,13 @@ + + +## Arguments +`--match_dir` matched scRNA-Seq CeleScope directory path + +`--outdir` output dir + +`--sample` sample name + +`--assay` assay + +`--read_count_file` tag read count file + diff --git a/docs/citeseq/multi_citeseq.md b/docs/citeseq/multi_citeseq.md new file mode 100644 index 00000000..6674538f --- /dev/null +++ b/docs/citeseq/multi_citeseq.md @@ -0,0 +1,95 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. 
+For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. +`L` linker(common sequences) +`C` tag barcode + +`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode +sequence in R2 reads with all tag barcode sequence in barcode_fasta. +It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. +If no such tag exists, the read is classified as invalid. +``` +>tag_0 +GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC +>tag_1 +TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG +>tag_2 +AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA +>tag_3 +CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG +``` + +`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads +with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. + +`--match_dir` matched scRNA-Seq CeleScope directory path + diff --git a/docs/dynaseq/conversion.md b/docs/dynaseq/conversion.md new file mode 100644 index 00000000..bfb0cb2a --- /dev/null +++ b/docs/dynaseq/conversion.md @@ -0,0 +1,26 @@ +## Features +- Get conversion pos in each read. + - Get snp info. + +## Output +- `{sample}.PosTag.bam` Bam file with conversion info. +- `{sample}.PosTag.csv` SNP info in csv format. + + +## Arguments +`--strand` gene strand file + +`--bam` featureCount bam + +`--cell` barcode cell list + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/multi_dynaseq.md b/docs/dynaseq/multi_dynaseq.md new file mode 100644 index 00000000..b84dcd21 --- /dev/null +++ b/docs/dynaseq/multi_dynaseq.md @@ -0,0 +1,110 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. 
+ +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--gtf_type` Specify feature type in GTF annotation + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. + +`--genomeDir` Required. Genome directory. + +`--save_rds` Write rds to disk. + +`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: +``` +cell_type marker +Alveolar "CLDN18,FOLR1,AQP4,PEBP4" +Endothelial "CLDN5,FLT1,CDH5,RAMP2" +Epithelial "CAPS,TMEM190,PIFO,SNTN" +Fibroblast "COL1A1,DCN,COL1A2,C1R" +B_cell "CD79A,IGKC,IGLC3,IGHG3" +Myeloid "LYZ,MARCO,FCGR3A" +T_cell "CD3D,TRBC1,TRBC2,TRAC" +LUAD "NKX2-1,NAPSA,EPCAM" +LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" +``` + +`--strand` gene strand file + +`--bg_cov` background snp depth filter, lower than bg_cov will be discarded. Only valid in csv format + diff --git a/docs/dynaseq/replace_tsne.md b/docs/dynaseq/replace_tsne.md new file mode 100644 index 00000000..31ed90ce --- /dev/null +++ b/docs/dynaseq/replace_tsne.md @@ -0,0 +1,30 @@ +## Features +- Replace rate in each cluster +- Top replace genes in each cluster + +## Output +- `{sample}.rep_in_tsne.txt` Replace rate in each cluster. +- `{sample}.rep_in_tsne_top10` Top 10 replace genes in each cluster. + + +## Arguments +`--tsne` tsne file + +`--mat` matrix rep file + +`--rep` cell rep file + +`--mincell` turn-over in at least cells, default 5 + +`--topgene` top N genes,default 10 + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. 
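+
+The two summaries above are straightforward aggregations. A toy pandas sketch of the idea (column and index names are assumed; the real step may differ):
+
+```
+import pandas as pd
+
+def rep_in_cluster(rep, clusters, topgene=10, mincell=5):
+    """rep: DataFrame (cells x genes) of per-cell new-RNA fractions, NaN where
+    a gene has no turnover information; clusters: Series mapping cell -> cluster."""
+    top = {}
+    for label, cells in rep.groupby(clusters):
+        detected = cells.notna().sum()                    # informative cells per gene
+        rates = cells.mean().where(detected >= mincell)   # respect the --mincell floor
+        top[label] = rates.nlargest(topgene)              # top genes by mean new fraction
+    return top
+```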
+ diff --git a/docs/dynaseq/replacement.md b/docs/dynaseq/replacement.md new file mode 100644 index 00000000..1184777c --- /dev/null +++ b/docs/dynaseq/replacement.md @@ -0,0 +1,36 @@ +## Features +- Computes the replacement rates in each cell and gene. +- Boxplots for rates distribution. + +## Output +- `{sample}.TC_matrix.rds` New and old info for each barcode/gene/umi. +- `{sample}.new_matrix.tsv.gz` New RNA matrix. +- `{sample}.old_matrix.tsv.gz` Old RNA matrix. +- `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell. +- `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene. +- `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene. + + +## Arguments +`--bg_cov` background snp depth filter, lower than bg_cov will be discarded. Only valid in csv format + +`--bam` bam file + +`--bg` background snp file + +`--cell_keep` filter cell + +`--min_cell` a gene expressed in at least cells, default 10 + +`--min_gene` at least gene num in a cell, default 10 + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/subsitution.md b/docs/dynaseq/subsitution.md new file mode 100644 index 00000000..e2b7b169 --- /dev/null +++ b/docs/dynaseq/subsitution.md @@ -0,0 +1,20 @@ +## Features +- Computes the overall conversion rates in reads and plots a barplot. + +## Output +- `{sample}.substitution.txt` Tab-separated table of the overall conversion rates. + + +## Arguments +`--bam` bam file + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/fusion/count_fusion.md b/docs/fusion/count_fusion.md new file mode 100644 index 00000000..39063118 --- /dev/null +++ b/docs/fusion/count_fusion.md @@ -0,0 +1,23 @@ + + +## Arguments +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--bam` None + +`--match_dir` match scRNA-Seq dir + +`--fusion_genomeDir` fusion genome directory + +`--flanking_base` None + +`--UMI_min` None + diff --git a/docs/fusion/mkref.md b/docs/fusion/mkref.md new file mode 100644 index 00000000..24ca414f --- /dev/null +++ b/docs/fusion/mkref.md @@ -0,0 +1,24 @@ + + +## Arguments +`--genomeDir` Default='./'. Output directory. + +`--thread` Default=6. Threads to use. + +`--genome_name` Required, genome name. + +`--dry_run` Only write config file and exit. + +`--fasta` fusion fasta file + +`--fusion_pos` fusion position file. A two column tab-delimited text file with header. +"pos" is the end postion of the first gene(1-based). +e.g. +tag pos +PML_3 183 +PML_4 254 +PML_5 326 +PML_6 204 + +`--genomeSAindexNbases` STAR genomeSAindexNbases + diff --git a/docs/fusion/multi_fusion.md b/docs/fusion/multi_fusion.md new file mode 100644 index 00000000..291a1756 --- /dev/null +++ b/docs/fusion/multi_fusion.md @@ -0,0 +1,90 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. 
+ +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fusion_genomeDir` fusion genome directory + +`--flanking_base` None + +`--UMI_min` None + diff --git a/docs/fusion/star_fusion.md b/docs/fusion/star_fusion.md new file mode 100644 index 00000000..4ac60c75 --- /dev/null +++ b/docs/fusion/star_fusion.md @@ -0,0 +1,32 @@ + + +## Arguments +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. 
+ +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fq` Required. R2 fastq file. + +`--consensus_fq` Input fastq has been consensused + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--fusion_genomeDir` fusion gene STAR index genome directory + diff --git a/docs/hla/mapping_hla.md b/docs/hla/mapping_hla.md new file mode 100644 index 00000000..d4f5f1e3 --- /dev/null +++ b/docs/hla/mapping_hla.md @@ -0,0 +1,15 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--fq` None + +`--assay` assay + +`--match_dir` match scRNA-Seq dir + +`--thread` number of thread + diff --git a/docs/hla/multi_hla.md b/docs/hla/multi_hla.md new file mode 100644 index 00000000..62c19f1b --- /dev/null +++ b/docs/hla/multi_hla.md @@ -0,0 +1,73 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. 
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--match_dir` match scRNA-Seq dir + +`--thread` number of thread + diff --git a/docs/manual.md b/docs/manual.md new file mode 100644 index 00000000..b39281ce --- /dev/null +++ b/docs/manual.md @@ -0,0 +1,41 @@ +## Introduction +CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. + +Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. + +Currently, CeleScope includes the follwing pipelines: + +- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). + +- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. + +- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. + + +## [Quick start](quick_start.md) + +## [Change log](CHANGELOG.md) + +## Pre-processing + +- [barcode](tools/barcode.md) +- [cutadapt](tools/cutadapt.md) + +## Single-cell rna +- [mkref](rna/mkref.md) +- [star](rna/star.md) +- [featureCounts](tools/featureCounts.md) +- [count](tools/count.md) +- [analysis](rna/analysis.md) +- [multi_rna](rna/multi_rna.md) +## Single-cell vdj +- [consensus](tools/consensus.md) +- [mapping_vdj](vdj/mapping_vdj.md) +- [count_vdj](vdj/count_vdj.md) +- [multi_vdj](vdj/multi_vdj.md) +## Single-cell tag +- [mapping_tag](tag/mapping_tag.md) +- [count_tag](tag/count_tag.md) +- [analysis_tag](tag/analysis_tag.md) +- [split_tag](tag/split_tag.md) +- [multi_tag](tag/multi_tag.md) diff --git a/docs/manual_template.md b/docs/manual_template.md new file mode 100644 index 00000000..c524de94 --- /dev/null +++ b/docs/manual_template.md @@ -0,0 +1,23 @@ +## Introduction +CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. + +Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. + +Currently, CeleScope includes the follwing pipelines: + +- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. 
It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). + +- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. + +- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. + + +## [Quick start](quick_start.md) + +## [Change log](CHANGELOG.md) + +## Pre-processing + +- [barcode](tools/barcode.md) +- [cutadapt](tools/cutadapt.md) + diff --git a/docs/mut/count_mut.md b/docs/mut/count_mut.md new file mode 100644 index 00000000..38699f30 --- /dev/null +++ b/docs/mut/count_mut.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--bam` None + +`--assay` assay + +`--mut_file` mutation file + +`--match_dir` match scRNA-Seq dir + +`--shift_base` None + diff --git a/docs/mut/mapping_mut.md b/docs/mut/mapping_mut.md new file mode 100644 index 00000000..afccd1b6 --- /dev/null +++ b/docs/mut/mapping_mut.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--fq` None + +`--assay` assay + +`--indel_genomeDir` insertion or deletion STAR indexed genome directory + +`--thread` STAR thread + +`--outFilterMatchNmin` STAR outFilterMatchNmin + diff --git a/docs/mut/multi_mut.md b/docs/mut/multi_mut.md new file mode 100644 index 00000000..23111560 --- /dev/null +++ b/docs/mut/multi_mut.md @@ -0,0 +1,81 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. 
Quality trimming of reads using two-color chemistry (NextSeq).
+Some Illumina instruments use a two-color chemistry to encode the four bases.
+This includes the NextSeq and the NovaSeq.
+In those instruments, a ‘dark cycle’ (with no detected color) encodes a G.
+However, dark cycles also occur when sequencing “falls off” the end of the fragment.
+The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+
+`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases.
+For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter.
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that
+at least {overlap} bases match between adapter and read.
+
+`--insert` Default `150`. Read2 insert length.
+
+`--indel_genomeDir` insertion or deletion STAR indexed genome directory
+
+`--thread` STAR thread
+
+`--outFilterMatchNmin` STAR outFilterMatchNmin
+
+`--mut_file` mutation file
+
+`--match_dir` match scRNA-Seq dir
+
+`--shift_base` None
+
diff --git a/docs/quick_start.md b/docs/quick_start.md
new file mode 100755
index 00000000..57d2327f
--- /dev/null
+++ b/docs/quick_start.md
@@ -0,0 +1,110 @@
+# Quick start
+
+CeleScope contains `multi_{assay}` interfaces to generate pipeline scripts for all assays. Assays can be one of:
+
+- rna
+- vdj
+- tag
+
+Run `multi_{assay} -h` for help.
+
+
+## Usage Example
+
+- Single-cell rna
+
+    ```
+    conda activate celescope
+    multi_rna\
+    --mapfile ./rna.mapfile\
+    --genomeDir /SGRNJ/Public/Database/genome/homo_mus\
+    --thread 8\
+    --mod shell
+    ```
+`--mapfile` Required. Mapfile path.
+
+`--genomeDir` Required. Genome directory.
+
+`--thread` The recommended setting is 8, and the maximum should not exceed 20.
+
+`--mod` Create `sjm` (simple job manager, https://github.com/StanfordBioinformatics/SJM) or `shell` scripts.
+
+The scripts above will generate a `shell` directory containing `{sample}.sh` files.
+
+You can start your analysis by running:
+```
+sh ./shell/{sample}.sh
+```
+
+- Single cell vdj
+
+```
+conda activate celescope
+multi_vdj \
+    --mapfile ./vdj.mapfile \
+    --type TCR \
+    --thread 8 \
+    --mod shell
+```
+
+`--type` Required. TCR or BCR.
+
+- Single cell tag
+
+```
+conda activate celescope
+multi_tag \
+    --mapfile ./tag.mapfile\
+    --barcode_fasta ./smk_barcode.fa\
+    --fq_pattern L25C45\
+    --mod shell
+```
+
+`--barcode_fasta` Required. Tag barcode fasta file.
+```
+>tag_0
+GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC
+>tag_1
+TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG
+>tag_2
+AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA
+>tag_3
+CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG
+```
+
+`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases.
+
+`L` linker (common sequences)
+`C` tag barcode
+
+## How to write mapfile
+
+Mapfile is a tab-delimited text file with at least three columns. Each line of the mapfile represents one pair of fastq files; a validation sketch and a worked example follow below.
+
+1st column: Fastq file prefix.
+2nd column: Fastq file directory path.
+3rd column: Sample name, which is the prefix of all output files.
+4th column: The 4th column has a different meaning for each assay. The single-cell rna output directory produced by running CeleScope is called `matched_dir`.
+- `rna` Optional, forced cell number.
+- `vdj` Optional, matched_dir.
+- `tag` Required, matched_dir.
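+
+A minimal sanity check for a mapfile can be written in a few lines of Python (a hypothetical helper, not part of CeleScope, assuming the `_1.fq.gz`/`_2.fq.gz` naming used in the example below):
+
+```
+import os
+
+def check_mapfile(path):
+    """Assert that every mapfile row points at an existing fastq pair."""
+    with open(path) as handle:
+        for number, line in enumerate(handle, start=1):
+            fields = line.rstrip("\n").split("\t")
+            assert len(fields) >= 3, f"line {number}: expected at least 3 columns"
+            prefix, fastq_dir, _sample = fields[:3]
+            for read in ("1", "2"):
+                fastq = os.path.join(fastq_dir, f"{prefix}_{read}.fq.gz")
+                assert os.path.exists(fastq), f"line {number}: missing {fastq}"
+```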
+
+### Example
+
+Sample1 has 2 paired-end fastq files located in 2 different directories (fastq_dir1 and fastq_dir2). Sample2 has 1 paired-end fastq file located in fastq_dir1.
+```
+$cat ./my.mapfile
+fastq_prefix1 fastq_dir1 sample1
+fastq_prefix2 fastq_dir2 sample1
+fastq_prefix3 fastq_dir1 sample2
+
+$ls fastq_dir1
+fastq_prefix1_1.fq.gz fastq_prefix1_2.fq.gz
+fastq_prefix3_1.fq.gz fastq_prefix3_2.fq.gz
+
+$ls fastq_dir2
+fastq_prefix2_1.fq.gz fastq_prefix2_2.fq.gz
+```
+
+
+
diff --git a/docs/rna/analysis.md b/docs/rna/analysis.md
new file mode 100644
index 00000000..9ddfd1b3
--- /dev/null
+++ b/docs/rna/analysis.md
@@ -0,0 +1,51 @@
+## Features
+- Cell clustering with Seurat.
+
+- Calculate the marker genes of each cluster.
+
+- Cell type annotation (optional). You can provide markers of known cell types and annotate cell types for each cluster.
+
+## Output
+- `markers.tsv` Marker genes of each cluster.
+
+- `tsne_coord.tsv` t-SNE coordinates and clustering information.
+
+- `{sample}/06.analysis/{sample}_auto_assign/` This result will only be obtained when the `--type_marker_tsv`
+parameter is provided. The result contains 3 files:
+    - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA",
+it means that the given markers are not enough to identify the cluster.
+    - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters.
+    - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1)
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--save_rds` Write rds to disk.
+
+`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example:
+```
+cell_type marker
+Alveolar "CLDN18,FOLR1,AQP4,PEBP4"
+Endothelial "CLDN5,FLT1,CDH5,RAMP2"
+Epithelial "CAPS,TMEM190,PIFO,SNTN"
+Fibroblast "COL1A1,DCN,COL1A2,C1R"
+B_cell "CD79A,IGKC,IGLC3,IGHG3"
+Myeloid "LYZ,MARCO,FCGR3A"
+T_cell "CD3D,TRBC1,TRBC2,TRAC"
+LUAD "NKX2-1,NAPSA,EPCAM"
+LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM"
+```
+
+`--matrix_file` Required. Matrix_10X directory from step count.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/rna/mkref.md b/docs/rna/mkref.md
new file mode 100644
index 00000000..c1b3d592
--- /dev/null
+++ b/docs/rna/mkref.md
@@ -0,0 +1,38 @@
+## Features
+- Create a genome reference directory.
+
+## Output
+
+- STAR genome index files
+
+- Genome refFlat file
+
+- Genome config file
+```
+$ cat celescope_genome.config
+[genome]
+genome_name = Homo_sapiens_ensembl_99
+genome_type = rna
+fasta = Homo_sapiens.GRCh38.dna.primary_assembly.fa
+gtf = Homo_sapiens.GRCh38.99.gtf
+refflat = Homo_sapiens_ensembl_99.refFlat
+```
+
+
+## Arguments
+`--genomeDir` Default='./'. Output directory.
+
+`--thread` Default=6. Threads to use.
+
+`--genome_name` Required, genome name.
+
+`--dry_run` Only write config file and exit.
+
+`--fasta` Required. Genome fasta file. Must be relative file path to genomeDir.
+
+`--gtf` Required. Genome gtf file. Must be relative file path to genomeDir.
+
+`--mt_gene_list` Mitochondria gene list file. Must be relative file path to genomeDir.
+It is a plain text file with one gene per line.
+If not provided, will use `MT-` and `mt-` to determine mitochondria genes.
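+
+## Usage
+
+A minimal sketch (the file names are taken from the config example above and are illustrative):
+```
+# run inside genomeDir; fasta and gtf paths are relative to it
+celescope rna mkref \
+ --genome_name Homo_sapiens_ensembl_99 \
+ --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa \
+ --gtf Homo_sapiens.GRCh38.99.gtf
+```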
+ diff --git a/docs/rna/multi_rna.md b/docs/rna/multi_rna.md new file mode 100644 index 00000000..a2c6067a --- /dev/null +++ b/docs/rna/multi_rna.md @@ -0,0 +1,106 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--gtf_type` Specify feature type in GTF annotation + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. 
Choose from `auto`, `cellranger3` and `inflection`.
+
+`--genomeDir` Required. Genome directory.
+
+`--save_rds` Write rds to disk.
+
+`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example:
+```
+cell_type marker
+Alveolar "CLDN18,FOLR1,AQP4,PEBP4"
+Endothelial "CLDN5,FLT1,CDH5,RAMP2"
+Epithelial "CAPS,TMEM190,PIFO,SNTN"
+Fibroblast "COL1A1,DCN,COL1A2,C1R"
+B_cell "CD79A,IGKC,IGLC3,IGHG3"
+Myeloid "LYZ,MARCO,FCGR3A"
+T_cell "CD3D,TRBC1,TRBC2,TRAC"
+LUAD "NKX2-1,NAPSA,EPCAM"
+LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM"
+```
+
diff --git a/docs/rna/star.md b/docs/rna/star.md
new file mode 100644
index 00000000..ec3b5211
--- /dev/null
+++ b/docs/rna/star.md
@@ -0,0 +1,56 @@
+## Features
+- Align R2 reads to the reference genome with STAR.
+- Collect metrics with Picard.
+
+## Output
+- `{sample}_Aligned.sortedByCoord.out.bam` BAM file containing uniquely mapped reads.
+
+- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format.
+
+- `{sample}_Log.out` Main log with a lot of detailed information about the run.
+This is most useful for troubleshooting and debugging.
+
+- `{sample}_Log.progress.out` Reports job progress statistics, such as the number of processed reads,
+% of mapped reads etc. It is updated at 1-minute intervals.
+
+- `{sample}_Log.final.out` Summary mapping statistics after the mapping job is complete,
+very useful for quality control. The statistics are calculated for each read (single- or paired-end) and
+then summed or averaged over all reads. Note that STAR counts a paired-end read as one read
+(unlike the samtools flagstat/idxstats, which count each mate separately).
+Most of the information is collected about the UNIQUE mappers
+(unlike samtools flagstat/idxstats, which do not separate unique and multi-mappers).
+Each splicing is counted in the numbers of splices, which would correspond to
+summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis,
+i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases.
+
+- `{sample}_region.log` Picard CollectRnaSeqMetrics results.
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases
+is higher than or equal to this value.
+
+`--out_unmapped` Output unmapped reads.
+
+`--STAR_param` Other STAR parameters.
+
+`--outFilterMultimapNmax` Default `1`. Maximum number of loci a read is allowed to map to.
+
+`--starMem` Default `30`. Maximum memory that STAR can use.
+
+`--fq` Required. R2 fastq file.
+
+`--consensus_fq` Input fastq has been consensused.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
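+
+As a minimal sketch of how `{sample}_Log.final.out` can be consumed downstream (the metric names are
+assumptions based on the standard STAR log layout, so check them against your STAR version):
+```python
+def parse_star_log(path):
+    """Parse STAR's Log.final.out into a {metric: value} dict."""
+    metrics = {}
+    with open(path) as f:
+        for line in f:
+            if '|' not in line:
+                continue  # skip section headers, which have no '|'
+            key, value = line.split('|', 1)
+            metrics[key.strip()] = value.strip()
+    return metrics
+
+log = parse_star_log('sample1_Log.final.out')
+print(log['Uniquely mapped reads %'])  # e.g. '85.00%'
+```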
+ diff --git a/docs/rna_virus/analysis_rna_virus.md b/docs/rna_virus/analysis_rna_virus.md new file mode 100644 index 00000000..8893b4e1 --- /dev/null +++ b/docs/rna_virus/analysis_rna_virus.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--assay` assay + +`--sample` sample name + +`--thread` None + +`--debug` debug + +`--matrix_file` matrix file + +`--virus_file` virus UMI count file + diff --git a/docs/rna_virus/count_virus.md b/docs/rna_virus/count_virus.md new file mode 100644 index 00000000..60f09d5c --- /dev/null +++ b/docs/rna_virus/count_virus.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--assay` assay + +`--sample` sample name + +`--thread` None + +`--debug` debug + +`--virus_bam` None + +`--barcode_file` None + diff --git a/docs/rna_virus/multi_rna_virus.md b/docs/rna_virus/multi_rna_virus.md new file mode 100644 index 00000000..26bf08ff --- /dev/null +++ b/docs/rna_virus/multi_rna_virus.md @@ -0,0 +1,92 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. 
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--virus_genomeDir` virus genome dir + +`--gtf_type` Specify feature type in GTF annotation + +`--genomeDir` Required. Genome directory. + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. + diff --git a/docs/rna_virus/star_virus.md b/docs/rna_virus/star_virus.md new file mode 100644 index 00000000..7ef14bd0 --- /dev/null +++ b/docs/rna_virus/star_virus.md @@ -0,0 +1,32 @@ + + +## Arguments +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fq` Required. R2 fastq file. + +`--consensus_fq` Input fastq has been consensused + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--virus_genomeDir` virus genome dir + diff --git a/docs/snp/analysis_snp.md b/docs/snp/analysis_snp.md new file mode 100644 index 00000000..fb2bd136 --- /dev/null +++ b/docs/snp/analysis_snp.md @@ -0,0 +1,23 @@ + + +## Arguments +`--annovar_config` annovar soft config file + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--match_dir` match_dir + +`--vcf` vcf file + +`--CID_file` CID_file + +`--variant_count_file` variant count file + diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md new file mode 100644 index 00000000..34858a56 --- /dev/null +++ b/docs/snp/mkref.md @@ -0,0 +1,29 @@ +## Features +- Create dictionary file and fasta index for gatk SplitNCigarReads. +(https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) +Need to run `celescope rna mkref` first + +## Output +- fasta index +- gatk dictionary file + +## Usage +``` +# run celescope rna mkref first +celescope snp mkref \ + --genome_name Homo_sapiens_ensembl_99 \ + --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa +``` + + +## Arguments +`--genomeDir` Default='./'. Output directory. + +`--thread` Default=6. Threads to use. + +`--genome_name` Required, genome name. + +`--dry_run` Only write config file and exit. 
+ +`--fasta` fasta file + diff --git a/docs/snp/multi_snp.md b/docs/snp/multi_snp.md new file mode 100644 index 00000000..ee762e9a --- /dev/null +++ b/docs/snp/multi_snp.md @@ -0,0 +1,97 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--threshold` Default 0.5. Valid base threshold. + +`--not_consensus` Skip the consensus step. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. 
+
+`--gtf_type` Specify feature type in GTF annotation.
+
+`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+
+`--genomeDir` Genome directory after running `mkref`.
+
+`--vcf` VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level
+and use these variants as input vcf.
+
+`--annovar_config` annovar config file
+
diff --git a/docs/snp/variant_calling.md b/docs/snp/variant_calling.md
new file mode 100644
index 00000000..aed2d6fa
--- /dev/null
+++ b/docs/snp/variant_calling.md
@@ -0,0 +1,38 @@
+## Features
+- Perform variant calling.
+
+## Output
+
+`{sample}_VID.tsv` A unique numeric ID is assigned for each variant.
+
+`{sample}_CID.tsv` A unique numeric ID is assigned for each cell.
+
+`{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count.
+
+`{sample}_support.mtx` Support matrix; only high quality bases are considered.
+0 : no reads/UMIs cover the position.
+1 : all reads/UMIs at the position support the ref allele.
+2 : all reads/UMIs at the position support the alt allele.
+3 : one or more reads/UMIs support both the alt and the ref allele.
+
+
+## Arguments
+`--genomeDir` Genome directory after running `mkref`.
+
+`--vcf` VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level
+and use these variants as input vcf.
+
+`--bam` Input BAM file from step `target_metrics`.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tag/analysis_tag.md b/docs/tag/analysis_tag.md
new file mode 100644
index 00000000..da0f476e
--- /dev/null
+++ b/docs/tag/analysis_tag.md
@@ -0,0 +1,19 @@
+## Features
+- Combine scRNA-Seq clustering information with tag assignment.
+
+
+## Arguments
+`--tsne_tag_file` `{sample}_tsne_tag.tsv` from count_tag.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tag/count_tag.md b/docs/tag/count_tag.md
new file mode 100644
index 00000000..815b3eb1
--- /dev/null
+++ b/docs/tag/count_tag.md
@@ -0,0 +1,44 @@
+## Features
+- Assign tag to each cell barcode and summarize.
+
+## Output
+
+- `{sample}_umi_tag.tsv`
+
+    `first column` cell barcode
+    `last column` assigned tag
+    `columns between first and last` UMI count for each tag
+
+- `{sample}_tsne_tag.tsv` It is `{sample}_umi_tag.tsv` with t-SNE coordinates, gene_counts and cluster information.
+
+- `{sample}_cluster_count.tsv` Number of cell barcodes assigned to *undetermined*, *multiplet* and *each tag*.
+
+
+## Arguments
+`--UMI_min` Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undetermined*.
+
+`--dim` Default=1. Tag dimensions. Usually we use 1-dimensional tags.
+
+`--SNR_min` Default='auto'. Minimum signal-to-noise ratio.
+Cell barcodes with UMI >= UMI_min and SNR < SNR_min are classified as *multiplet*.
+
+`--combine_cluster` Combine cluster tsv file.
+
+`--coefficient` Default=0.1. If `SNR_min` is 'auto', the minimum signal-to-noise ratio is calculated as
+`SNR_min = max(median(SNRs) * coefficient, 2)`.
+A smaller `coefficient` will result in fewer *multiplet* calls in the tag assignment.
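+
+As a minimal sketch of the 'auto' threshold (the variable names are illustrative, not the actual implementation):
+```python
+import statistics
+
+def auto_SNR_min(SNRs, coefficient=0.1):
+    """SNR_min = max(median(SNRs) * coefficient, 2)"""
+    return max(statistics.median(SNRs) * coefficient, 2)
+
+snrs = [30.0, 1.2, 25.0, 0.9, 40.0]  # hypothetical per-barcode SNR values
+print(auto_SNR_min(snrs))  # 2.5
+```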
+ +`--read_count_file` Tag read count file. + +`--match_dir` Match celescope scRNA-Seq directory. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/tag/mapping_tag.md b/docs/tag/mapping_tag.md new file mode 100644 index 00000000..1f7ab1ae --- /dev/null +++ b/docs/tag/mapping_tag.md @@ -0,0 +1,48 @@ +## Features +- Align R2 reads to the tag barcode fasta. + +## Output + +- `{sample}_read_count.tsv` tab-delimited text file with 4 columns. + + `barcode` cell barcode + `tag_name` tag name in barcode_fasta + `UMI` UMI sequence + `read_count` read count per UMI + + +## Arguments +`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. +`L` linker(common sequences) +`C` tag barcode + +`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode +sequence in R2 reads with all tag barcode sequence in barcode_fasta. +It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. +If no such tag exists, the read is classified as invalid. +``` +>tag_0 +GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC +>tag_1 +TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG +>tag_2 +AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA +>tag_3 +CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG +``` + +`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads +with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--fq` R2 read fastq. + diff --git a/docs/tag/multi_tag.md b/docs/tag/multi_tag.md new file mode 100644 index 00000000..58414ae6 --- /dev/null +++ b/docs/tag/multi_tag.md @@ -0,0 +1,108 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. 
+ +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. +`L` linker(common sequences) +`C` tag barcode + +`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode +sequence in R2 reads with all tag barcode sequence in barcode_fasta. +It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. +If no such tag exists, the read is classified as invalid. +``` +>tag_0 +GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC +>tag_1 +TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG +>tag_2 +AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA +>tag_3 +CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG +``` + +`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads +with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. + +`--UMI_min` Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*. + +`--dim` Default=1. Tag dimentions. Usually we use 1-dimentional tag. + +`--SNR_min` Default='auto'. Minimum signal-to-noise ratio. +Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. + +`--combine_cluster` Conbine cluster tsv file. + +`--coefficient` Default=0.1. If `SNR_min` is 'auto', minimum signal-to-noise ratio is calulated as +`SNR_min = max(median(SNRs) * coefficient, 2)`. +Smaller `coefficient` will cause less *multiplet* in the tag assignment. + +`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment. + diff --git a/docs/tag/split_tag.md b/docs/tag/split_tag.md new file mode 100644 index 00000000..5a43f7f8 --- /dev/null +++ b/docs/tag/split_tag.md @@ -0,0 +1,26 @@ +## Features +- Split scRNA-Seq fastq according to tag assignment. + +## Output +- `fastq/{tag}_{1,2}.fq` Fastq files of each tag. + + +## Arguments +`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment. + +`--umi_tag_file` UMI tag file. + +`--match_dir` Match celescope scRNA-Seq directory. + +`--R1_read` R1 read path. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. 
+ +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/tcr_fl/assemble.md b/docs/tcr_fl/assemble.md new file mode 100644 index 00000000..95662af1 --- /dev/null +++ b/docs/tcr_fl/assemble.md @@ -0,0 +1,15 @@ + + +## Arguments +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--fastq_dir` None + diff --git a/docs/tcr_fl/multi_tcr_fl.md b/docs/tcr_fl/multi_tcr_fl.md new file mode 100644 index 00000000..e70531b4 --- /dev/null +++ b/docs/tcr_fl/multi_tcr_fl.md @@ -0,0 +1,79 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--match_dir` match scRNA-Seq dir + +`--nCell` select top N cell + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. 
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tcr_fl/split_fq.md b/docs/tcr_fl/split_fq.md
new file mode 100644
index 00000000..fe767158
--- /dev/null
+++ b/docs/tcr_fl/split_fq.md
@@ -0,0 +1,15 @@
+
+
+## Arguments
+`--outdir` output dir
+
+`--sample` sample name
+
+`--fq` None
+
+`--assay` assay
+
+`--match_dir` match scRNA-Seq dir
+
+`--nCell` select top N cells
+
diff --git a/docs/tools/barcode.md b/docs/tools/barcode.md
new file mode 100644
index 00000000..9f31bd94
--- /dev/null
+++ b/docs/tools/barcode.md
@@ -0,0 +1,61 @@
+## Features
+
+- Demultiplex barcodes.
+- Filter invalid R1 reads, which include:
+    - Reads without linker: the mismatch between linkers and all linkers in the whitelist is greater than 2.
+    - Reads without correct barcode: the mismatch between barcodes and all barcodes in the whitelist is greater than 1.
+    - Reads without polyT: the number of T bases in the defined polyT region is less than 10.
+    - Low quality reads: low sequencing quality in barcode and UMI regions.
+
+## Output
+
+- `01.barcode/{sample}_2.fq(.gz)` Demultiplexed R2 reads. Barcode and UMI are contained in the read name. The format of
+the read name is `{barcode}_{UMI}_{read ID}`.
+
+
+## Arguments
+`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of:
+- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations.
+- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries.
+- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the
+same time.
+
+`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number
+ of bases.
+- `C`: cell barcode
+- `L`: linker (common sequences)
+- `U`: UMI
+- `T`: poly T
+
+`--whitelist` Cell barcode whitelist file path, one cell barcode per line.
+
+`--linker` Linker whitelist file path, one linker per line.
+
+`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases.
+
+`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI.
+
+`--nopolyT` Outputs R1 reads without polyT.
+
+`--noLinker` Outputs R1 reads without correct linker.
+
+`--allowNoPolyT` Allow valid reads without polyT.
+
+`--allowNoLinker` Allow valid reads without correct linker.
+
+`--gzip` Output gzipped fastq files.
+
+`--fq1` R1 fastq file. Multiple files are separated by comma.
+
+`--fq2` R2 fastq file. Multiple files are separated by comma.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tools/consensus.md b/docs/tools/consensus.md
new file mode 100644
index 00000000..77e11286
--- /dev/null
+++ b/docs/tools/consensus.md
@@ -0,0 +1,24 @@
+## Features
+- Combine all reads with the same (barcode, UMI) combination into one consensus read (UMI).
+
+## Output
+- `{sample}_consensus.fq` Consensus fastq.
+
+
+## Arguments
+`--threshold` Default 0.5. Valid base threshold.
+
+`--not_consensus` Skip the consensus step.
+
+`--fq` Required. Fastq file.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
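+
+The per-position idea behind the `--threshold` option, as a minimal sketch (an illustration, not the actual
+implementation; it assumes reads are already grouped by (barcode, UMI)):
+```python
+from collections import Counter
+
+def consensus(reads, threshold=0.5):
+    """Majority base per position; 'N' when no base reaches the threshold."""
+    length = max(len(r) for r in reads)
+    bases = []
+    for i in range(length):
+        counts = Counter(r[i] for r in reads if i < len(r))
+        base, n = counts.most_common(1)[0]
+        bases.append(base if n / sum(counts.values()) >= threshold else 'N')
+    return ''.join(bases)
+
+print(consensus(['ACGT', 'ACGA', 'ACGT']))  # ACGT
+```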
+
diff --git a/docs/tools/count.md b/docs/tools/count.md
new file mode 100644
index 00000000..102eb020
--- /dev/null
+++ b/docs/tools/count.md
@@ -0,0 +1,61 @@
+## Features
+- Cell-calling: Distinguish cell barcodes from background barcodes.
+
+- Generate expression matrix.
+
+## Output
+- `{sample}_all_matrix` The expression matrix of all detected barcodes.
+ Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix_10X` The expression matrix of barcodes identified as cells.
+Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix.tsv.gz` The expression matrix of barcodes identified as cells, separated by tabs.
+CeleScope >=1.2.0 does not output this file.
+
+- `{sample}_count_detail.txt.gz` 4 columns:
+    - barcode
+    - gene ID
+    - UMI count
+    - read_count
+
+- `{sample}_counts.txt` 6 columns:
+    - Barcode: barcode sequence
+    - readcount: read count of each barcode
+    - UMI2: UMI count (with reads per UMI >= 2) for each barcode
+    - UMI: UMI count for each barcode
+    - geneID: gene count for each barcode
+    - mark: cell barcode or background barcode.
+
+        `CB` cell
+        `UB` background
+
+- `{sample}_downsample.txt` 3 columns:
+    - percent: percentage of sampled reads
+    - median_geneNum: median gene number per cell
+    - saturation: sequencing saturation
+
+- `barcode_filter_magnitude.pdf` Barcode-UMI plot.
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--expected_cell_num` Default `3000`. Expected cell number.
+
+`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
+`--bam` Required. BAM file from featureCounts.
+
+`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%.
+
diff --git a/docs/tools/cutadapt.md b/docs/tools/cutadapt.md
new file mode 100644
index 00000000..e75d6e72
--- /dev/null
+++ b/docs/tools/cutadapt.md
@@ -0,0 +1,44 @@
+## Features
+- Trim adapters in R2 reads with cutadapt. Default adapters include:
+    - polyT=A{18}, 18 A bases.
+    - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter.
+
+## Output
+- `cutadapt.log` Cutadapt output log file.
+- `{sample}_clean_2.fq.gz` R2 reads file without adapters.
+
+
+## Arguments
+`--adapter_fasta` Additional adapter fasta file.
+
+`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH.
+
+`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq).
+Some Illumina instruments use a two-color chemistry to encode the four bases.
+This includes the NextSeq and the NovaSeq.
+In those instruments, a ‘dark cycle’ (with no detected color) encodes a G.
+However, dark cycles also occur when sequencing “falls off” the end of the fragment.
+The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+
+`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases.
+For example, roughly 25% of all reads end with a base that is identical to the first base of the adapter.
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that
+at least {overlap} bases match between adapter and read.
+
+`--insert` Default `150`. Read2 insert length.
+
+`--fq` Required. R2 reads from step Barcode.
+
+`--gzip` Output gzipped fastq.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tools/featureCounts.md b/docs/tools/featureCounts.md
new file mode 100644
index 00000000..3822ca55
--- /dev/null
+++ b/docs/tools/featureCounts.md
@@ -0,0 +1,38 @@
+## Features
+
+- Assign uniquely mapped reads to genomic features with featureCounts.
+
+## Output
+- `{sample}` Numbers of reads assigned to features (or meta-features).
+
+- `{sample}_summary` Stat info for the overall summarization results, including the number of
+successfully assigned reads and the number of reads that failed to be assigned due to
+various reasons (these reasons are included in the stat info).
+
+- `{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam` featureCounts output BAM,
+sorted by coordinates. The BAM file contains the following tags (software version >= 1.1.8):
+    - CB cell barcode
+    - UB UMI
+    - GN gene name
+    - GX gene id
+
+- `{sample}_name_sorted.bam` featureCounts output BAM, sorted by read name.
+
+
+## Arguments
+`--gtf_type` Specify feature type in GTF annotation.
+
+`--genomeDir` Required. Genome directory.
+
+`--input` Required. BAM file path.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tools/sample.md b/docs/tools/sample.md
new file mode 100644
index 00000000..e6fb6ce3
--- /dev/null
+++ b/docs/tools/sample.md
@@ -0,0 +1,17 @@
+
+
+## Arguments
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
+`--fq1` read1 fq file
+
+`--chemistry` chemistry version
+
diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md
new file mode 100644
index 00000000..d2fbaa04
--- /dev/null
+++ b/docs/tools/target_metrics.md
@@ -0,0 +1,28 @@
+## Features
+- Filter BAM file
+    - Filter reads that are not cell-associated.
+    - Filter reads that are not mapped to target genes.
+
+- Collect enrichment metrics.
+
+## Output
+- `filtered.bam` BAM file after filtering.
+
+
+## Arguments
+`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+
+`--bam` Input bam file
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/vdj/count_vdj.md b/docs/vdj/count_vdj.md
new file mode 100644
index 00000000..bd10f86d
--- /dev/null
+++ b/docs/vdj/count_vdj.md
@@ -0,0 +1,37 @@
+## Features
+- Cell-calling based on barcode-UMI rank.
+- Summarize clonetypes information.
+
+## Output
+- `{sample}_cell_confident.tsv` The clone type of VDJ cell barcodes; each chain occupies one line.
+
+- `{sample}_cell_confident_count.tsv` The clone type of VDJ cell barcodes; each cell occupies one line.
+
+- `{sample}_clonetypes.tsv` The count and percentage of each clonetype of VDJ cell barcodes.
+
+- `{sample}_match_clonetypes.tsv` When summarizing clonetypes, only consider barcodes in the matched scRNA-Seq library.
+This file will only be produced when the `match_dir` parameter is provided.
+
+
+## Arguments
+`--type` Required. `TCR` or `BCR`.
+
+`--UMI_min` Default `auto`. Minimum UMI number to filter. A barcode with UMI >= UMI_min is considered to be a cell.
+
+`--iUMI` Default `1`. Minimum number of UMIs of identical receptor type and CDR3.
+For each (barcode, chain) combination, only UMI >= iUMI is considered valid.
+
+`--UMI_count_filter_file` Required. File from step mapping_vdj.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/vdj/mapping_vdj.md b/docs/vdj/mapping_vdj.md
new file mode 100644
index 00000000..25bb304a
--- /dev/null
+++ b/docs/vdj/mapping_vdj.md
@@ -0,0 +1,35 @@
+## Features
+- Align R2 reads to the IMGT (http://www.imgt.org/) database sequences with MiXCR.
+
+## Output
+- `{sample}_consensus.fasta` Fasta file after UMI consensus.
+
+- `{sample}_UMI_count_unfiltered.tsv` UMI read count for each (barcode, chain, VJ_pair) combination.
+
+- `{sample}_UMI_count_filtered.tsv` For each (barcode, chain) combination, only the record with the
+most VJ_pair UMI reads is kept.
+
+- `{sample}_align.txt` Result report.
+
+- `{sample}_alignments.txt` The alignment result of each UMI/read.
+
+
+## Arguments
+`--type` TCR or BCR.
+
+`--species` Default `hs`. `hs` (human) or `mmu` (mouse).
+
+`--not_consensus` Input fastq is not consensused.
+
+`--fq` Required. Input fastq file.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/vdj/multi_vdj.md b/docs/vdj/multi_vdj.md
new file mode 100644
index 00000000..2db8fd77
--- /dev/null
+++ b/docs/vdj/multi_vdj.md
@@ -0,0 +1,84 @@
+
+
+## Arguments
+`--mod` mod, sjm or shell
+
+`--mapfile` tsv file, 4 columns:
+    1st col: LibName;
+    2nd col: DataDir;
+    3rd col: SampleName;
+    4th col: optional;
+
+`--rm_files` remove redundant fq.gz and bam after running
+
+`--steps_run` Steps to run. Multiple Steps are separated by comma.
+
+`--outdir` Output directory.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
+`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of:
+- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations.
+- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries.
+- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the
+same time.
+
+`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number
+ of bases.
+- `C`: cell barcode
+- `L`: linker (common sequences)
+- `U`: UMI
+- `T`: poly T
+
+`--whitelist` Cell barcode whitelist file path, one cell barcode per line.
+
+`--linker` Linker whitelist file path, one linker per line.
+
+`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases.
+
+`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI.
+
+`--nopolyT` Outputs R1 reads without polyT.
+
+`--noLinker` Outputs R1 reads without correct linker.
+ +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--threshold` Default 0.5. Valid base threshold. + +`--species` Default `hs`. `hs`(human) or `mmu`(mouse). + +`--not_consensus` Input fastq is not consensused. + +`--type` Required. `TCR` or `BCR`. + +`--UMI_min` Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell. + +`--iUMI` Default `1`. Minimum number of UMI of identical receptor type and CDR3. +For each (barcode, chain) combination, only UMI>=iUMI is considered valid. + diff --git a/generate_docs.py b/generate_docs.py index e4788abc..ddc2a421 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -11,11 +11,12 @@ from celescope.__init__ import ASSAY_DICT, RELEASED_ASSAYS PRE_PROCESSING_STEPS = ('sample', 'barcode', 'cutadapt') DOCS_DIR = 'docs/' TEMPLATE_DIR = 'docs_template/' -MANUAL_MD = f'{DOCS_DIR}/manual.md' +MANUAL = f'{DOCS_DIR}/manual.md' MANUAL_TEMPLATE = f'{DOCS_DIR}/manual_template.md' def get_argument_docs_from_parser(parser): + argument_docs = "" for argument in parser._option_string_actions: if not argument in ['-h', '--help']: help_msg = parser._option_string_actions[argument].help @@ -52,20 +53,20 @@ class Docs(): self.assay = assay init_module = utils.find_assay_init(assay) - self.steps = init_module.__STEPS__ + self.steps = init_module.__STEPS__.copy() self.steps.append(f'multi_{assay}') - folder = f'{DOCS_DIR}/{assay}/' self.out_md_dict = {} self.relative_md_path = {} - for step in self.steps: - self.out_md_dict[step] = f'{folder}/{step}.md' - self.relative_md_path[step] = f'{assay}/{step}.md' + self.release_bool = self.assay in RELEASED_ASSAYS - if not os.path.exists(folder): - os.system(f'mkdir -p {folder}') + assay_dir = f'docs/{assay}' + if not os.path.exists(assay_dir): + os.system(f'mkdir -p {assay_dir}') + @utils.add_log def get_argument_docs(self, step, step_module): + self.get_argument_docs.logger.info(step) if step.startswith("multi"): multi_class = getattr(step_module, f'Multi_{self.assay}') multi_obj = multi_class(self.assay) @@ -75,11 +76,18 @@ class Docs(): func_opts = getattr(step_module, f"get_opts_{step}") func_opts(parser, sub_program=True) argument_docs = get_argument_docs_from_parser(parser) - return argument_docs + return argument_docs def write_step_doc(self, step): + """ + folder: docs/folder/*.md + """ step_module 
= utils.find_step_module(self.assay, step) + folder = step_module.__name__.split('.')[1] + self.out_md_dict[step] = f'docs/{folder}/{step}.md' + self.relative_md_path[step] = f'{folder}/{step}.md' + class_docs = get_class_docs(step_module) argument_docs = self.get_argument_docs(step, step_module) @@ -87,12 +95,24 @@ class Docs(): out_file.write(class_docs) out_file.write(argument_docs) -def write_step_in_manual(md_path, step, manual_handle): - """ - - [mkref](rna/mkref.md) - """ - if not step in PRE_PROCESSING_STEPS: - manual_handle.write(f'- [{step}]({md_path})\n') + def run(self): + if self.release_bool: + with open(MANUAL, 'a') as writer: + writer.write(f'## {ASSAY_DICT[self.assay]}\n') + + for step in self.steps: + self.write_step_doc(step) + if self.release_bool: + self.write_step_in_manual(step) + + + def write_step_in_manual(self, step): + """ + - [mkref](rna/mkref.md) + """ + if not step in PRE_PROCESSING_STEPS: + with open(MANUAL, 'a') as writer: + writer.write(f'- [{step}]({self.relative_md_path[step]})\n') """ @@ -129,8 +149,23 @@ def write_manual(md_path_dict): write_step_in_manual(md_path, step, manual_handle) +def main(): + cmd = ( + f"rm -r {DOCS_DIR};" + f"cp -r {TEMPLATE_DIR} {DOCS_DIR}" + ) + os.system(cmd) + + with open(MANUAL, 'w') as manual_handle: + with open(MANUAL_TEMPLATE, 'r') as manual_template: + manual_handle.write(manual_template.read()) + + for assay in ASSAY_DICT: + docs_obj = Docs(assay) + docs_obj.run() + + + if __name__ == "__main__": - cmd = f"cp -r {TEMPLATE_DIR} {DOCS_DIR}" - os.system(cmd) - \ No newline at end of file + main() -- Gitee From ff359a28a975736dc3d8d4191385ed103a07e8e9 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:16:44 +0800 Subject: [PATCH 88/96] update --- celescope/snp/multi_snp.py | 24 ++++++++++++------------ docs/snp/multi_snp.md | 19 +++++++++++++++++++ generate_docs.py | 9 ++++++++- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index b0d3eba1..7215ad08 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -6,22 +6,22 @@ class Multi_snp(Multi): """ Usage ``` - multi_snp\ - --mapfile ./test1.mapfile\ - --genomeDir {genomeDir after running celescope snp mkref}\ - --thread 10\ - --mod shell\ - --gene_list gene_list.tsv\ - --annovar_config annovar.config\ + multi_snp\\ + --mapfile ./test1.mapfile\\ + --genomeDir {genomeDir after running celescope snp mkref}\\ + --thread 10\\ + --mod shell\\ + --gene_list gene_list.tsv\\ + --annovar_config annovar.config\\ ``` annovar_config file ``` [ANNOVAR] - dir = /Public/Software/annovar/ - db = /SGRNJ/Database/script/database/annovar/humandb - buildver = hg38 - protocol = refGene,cosmic70 - operation = g,f + dir = /Public/Software/annovar/ + db = /SGRNJ/Database/script/database/annovar/humandb + buildver = hg38 + protocol = refGene,cosmic70 + operation = g,f ``` """ diff --git a/docs/snp/multi_snp.md b/docs/snp/multi_snp.md index ee762e9a..481b950f 100644 --- a/docs/snp/multi_snp.md +++ b/docs/snp/multi_snp.md @@ -1,3 +1,22 @@ +## Usage +``` +multi_snp\ + --mapfile ./test1.mapfile\ + --genomeDir {genomeDir after running celescope snp mkref}\ + --thread 10\ + --mod shell\ + --gene_list gene_list.tsv\ + --annovar_config annovar.config\ +``` +annovar_config file +``` +[ANNOVAR] +dir = /Public/Software/annovar/ +db = /SGRNJ/Database/script/database/annovar/humandb +buildver = hg38 +protocol = refGene,cosmic70 +operation = g,f +``` ## Arguments diff --git a/generate_docs.py 
b/generate_docs.py index ddc2a421..de87484c 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -36,7 +36,14 @@ def get_class_docs(step_module): if class_obj.__module__ != step_module.__name__: continue doc = inspect.getdoc(class_obj) - if doc and "Features" in doc: + + write_bool = False + if doc: + for title in titles: + if title in doc: + write_bool = True + + if write_bool: for line in doc.split('\n'): for title in titles: if line.find(title) != -1: -- Gitee From b1173d0bcfc6f05c985535d608c5ba697ff2be2a Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:22:40 +0800 Subject: [PATCH 89/96] add dynaseq release doc --- celescope/__init__.py | 2 +- docs/manual.md | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index e89d5496..2ac251a9 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -21,7 +21,7 @@ ASSAY_DICT = { ROOT_PATH = os.path.dirname(__file__) -RELEASED_ASSAYS = ['rna', 'vdj', 'tag', ] +RELEASED_ASSAYS = ['rna', 'vdj', 'tag', 'dynaseq'] HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', diff --git a/docs/manual.md b/docs/manual.md index b39281ce..dbdc7542 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -39,3 +39,13 @@ Currently, CeleScope includes the follwing pipelines: - [analysis_tag](tag/analysis_tag.md) - [split_tag](tag/split_tag.md) - [multi_tag](tag/multi_tag.md) +## Single Cell Dynaseq +- [star](rna/star.md) +- [featureCounts](tools/featureCounts.md) +- [count](tools/count.md) +- [analysis](rna/analysis.md) +- [conversion](dynaseq/conversion.md) +- [subsitution](dynaseq/subsitution.md) +- [replacement](dynaseq/replacement.md) +- [replace_tsne](dynaseq/replace_tsne.md) +- [multi_dynaseq](dynaseq/multi_dynaseq.md) -- Gitee From 9e6fbe0d3f74aa81b184774f950237185c1989d8 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:24:00 +0800 Subject: [PATCH 90/96] typo --- celescope/__init__.py | 2 +- docs/manual.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index 2ac251a9..ceede280 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -16,7 +16,7 @@ ASSAY_DICT = { 'tag': 'Single-cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', - 'dynaseq': 'Single Cell Dynaseq' + 'dynaseq': 'Single-cell dynaseq' } ROOT_PATH = os.path.dirname(__file__) diff --git a/docs/manual.md b/docs/manual.md index dbdc7542..6ffcec89 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -39,7 +39,7 @@ Currently, CeleScope includes the follwing pipelines: - [analysis_tag](tag/analysis_tag.md) - [split_tag](tag/split_tag.md) - [multi_tag](tag/multi_tag.md) -## Single Cell Dynaseq +## Single-cell dynaseq - [star](rna/star.md) - [featureCounts](tools/featureCounts.md) - [count](tools/count.md) -- Gitee From 534298a9fcbf923881d42317583c3fb7d18ee116 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:29:24 +0800 Subject: [PATCH 91/96] update docs --- celescope/__init__.py | 3 ++- celescope/snp/mkref.py | 3 ++- docs/snp/mkref.md | 2 +- docs/snp/multi_snp.md | 2 +- docs/tools/target_metrics.md | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index ceede280..a4d2a5d0 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -25,8 +25,9 @@ RELEASED_ASSAYS = ['rna', 'vdj', 'tag', 'dynaseq'] HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', - 'gene_list': 'Gene list 
+    'gene_list': 'Required. Gene list file, one gene symbol per line. Only results of these genes are reported.',
     'genomeDir': 'Genome directory after running `mkref`.',
     'thread': 'Thread to use.',
     'debug': 'If this argument is used, celescope may output additional files for debugging.',
+    'fasta': 'Required. Genome fasta file. Use relative path to `genomeDir`.',
 }
diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py
index e8659428..b4795e33 100644
--- a/celescope/snp/mkref.py
+++ b/celescope/snp/mkref.py
@@ -5,6 +5,7 @@ import subprocess
 import celescope.tools.utils as utils
 from celescope.tools.mkref import Mkref
 from celescope.tools.mkref import get_opts_mkref as opts
+from celescope.__init__ import HELP_DICT
 
 
 class Mkref_snp(Mkref):
@@ -78,4 +79,4 @@ def mkref(args):
 def get_opts_mkref(parser, sub_program):
     opts(parser, sub_program)
     if sub_program:
-        parser.add_argument("--fasta", help="fasta file", required=True)
+        parser.add_argument("--fasta", help=HELP_DICT['fasta'], required=True)
diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md
index 34858a56..323b3771 100644
--- a/docs/snp/mkref.md
+++ b/docs/snp/mkref.md
@@ -25,5 +25,5 @@ celescope snp mkref \
 
 `--dry_run` Only write config file and exit.
 
-`--fasta` fasta file
+`--fasta` Required. Genome fasta file. Use relative path to `genomeDir`.
 
diff --git a/docs/snp/multi_snp.md b/docs/snp/multi_snp.md
index 481b950f..e6b6435a 100644
--- a/docs/snp/multi_snp.md
+++ b/docs/snp/multi_snp.md
@@ -105,7 +105,7 @@ is higher than or equal to this value.
 
 `--gtf_type` Specify feature type in GTF annotation
 
-`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+`--gene_list` Required. Gene list file, one gene symbol per line. Only results of these genes are reported.
 
 `--genomeDir` Genome directory after running `mkref`.
 
diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md
index d2fbaa04..9e170bf9 100644
--- a/docs/tools/target_metrics.md
+++ b/docs/tools/target_metrics.md
@@ -10,7 +10,7 @@
 
 ## Arguments
 
-`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+`--gene_list` Required. Gene list file, one gene symbol per line. Only results of these genes are reported.
`--bam` Input bam file -- Gitee From 75446cbacb1288a8c86dcd65540c8ee85f7d9882 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 25 Jun 2021 18:19:30 +0800 Subject: [PATCH 92/96] add mapping --- celescope/templates/html/trust_vdj/base.html | 4 +- ...mble_summary.html => mapping_summary.html} | 4 +- celescope/trust_vdj/__init__.py | 3 +- .../{trust_assemble.py => assemble.py} | 223 ++++++------------ celescope/trust_vdj/convert.py | 7 +- celescope/trust_vdj/mapping.py | 122 ++++++++++ celescope/trust_vdj/multi_trust_vdj.py | 13 +- celescope/trust_vdj/res_filter.py | 8 +- 8 files changed, 222 insertions(+), 162 deletions(-) rename celescope/templates/html/trust_vdj/{trust_assemble_summary.html => mapping_summary.html} (93%) rename celescope/trust_vdj/{trust_assemble.py => assemble.py} (32%) create mode 100644 celescope/trust_vdj/mapping.py diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html index fcd8607c..36025b76 100644 --- a/celescope/templates/html/trust_vdj/base.html +++ b/celescope/templates/html/trust_vdj/base.html @@ -133,8 +133,8 @@ {% include "html/trust_vdj/convert_summary.html"%} {% endif %} - {% if trust_assemble_summary is defined %} - {% include "html/trust_vdj/trust_assemble_summary.html"%} + {% if mapping_summary is defined %} + {% include "html/trust_vdj/mapping_summary.html"%} {% endif %} {% if res_filter_summary is defined %} diff --git a/celescope/templates/html/trust_vdj/trust_assemble_summary.html b/celescope/templates/html/trust_vdj/mapping_summary.html similarity index 93% rename from celescope/templates/html/trust_vdj/trust_assemble_summary.html rename to celescope/templates/html/trust_vdj/mapping_summary.html index 0d18b19f..797e9199 100644 --- a/celescope/templates/html/trust_vdj/trust_assemble_summary.html +++ b/celescope/templates/html/trust_vdj/mapping_summary.html @@ -13,7 +13,7 @@

reads Mapped to TRB: reads confidently mapped to TRB chain.

- {% for item in trust_assemble_summary %} + {% for item in mapping_summary %} {% if loop.index <= (loop.length+1)/2 %} {% for i in item %} @@ -25,7 +25,7 @@
- {% for item in trust_assemble_summary %} + {% for item in mapping_summary %} {% if loop.index > (loop.length+1)/2 %} {% for i in item %} diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py index f95b64a2..8b95beeb 100644 --- a/celescope/trust_vdj/__init__.py +++ b/celescope/trust_vdj/__init__.py @@ -1,6 +1,7 @@ __STEPS__ = [ 'sample', 'convert', - 'trust_assemble', + 'assemble', + 'mapping', 'res_filter'] __ASSAY__ = 'trust_vdj' diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/assemble.py similarity index 32% rename from celescope/trust_vdj/trust_assemble.py rename to celescope/trust_vdj/assemble.py index 8cf8f76a..6eb35684 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -1,19 +1,20 @@ import os from celescope.tools import utils from celescope.tools.Step import Step, s_common -from celescope.tracer_vdj.split_fastq import get_barcodes -from celescope.tools.barcode import * import pysam import pandas as pd from collections import defaultdict +import glob +import re +from Bio.Seq import Seq -TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/run-trust4' +TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/' - -def count_fq(fq1): +@utils.add_log +def count_fq(fq): dic = defaultdict(list) - with pysam.FastxFile(fq1) as fq: + with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.sequence cb = attr[:24] @@ -24,146 +25,85 @@ def count_fq(fq1): dic['seq_name'].append(name) count_df = pd.DataFrame(dic, columns=list(dic.keys())) - - return count_df - -@utils.add_log -def match_barcodes(outdir, match_dir, Seqtype, fq1): - annotated_bcs = get_barcodes(match_dir, Seqtype) - bcs_df = pd.DataFrame(annotated_bcs, columns=['barcode']) - count_df = count_fq(fq1) + return count_df - # count UMI - df_umi = count_df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) - df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) - df_umi = df_umi.sort_values(by='UMI', ascending=False) - df_umi.to_csv(f'{outdir}/count.txt', sep='\t', index=False) - df_n = pd.merge(bcs_df, count_df, on='barcode', how='inner') - seqnames = df_n['seq_name'].tolist() - seqlist = open(f'{outdir}/seqlist.txt', 'w') - for name in seqnames: - seqlist.write(str(name) + '\n') +class Assemble(Step): + """ + Features + - Get fq file + """ -def clean_fq(fq1, fq2, outdir, sample, species): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) - prefix = f'{outdir}/{sample}_clean' + self.outdir = args.outdir + self.fq1 = args.fq1 + self.fq2 = args.fq2 + self.sample = args.sample + self.species = args.species + self.speed_up = args.speed_up + self.match_dir = args.match_dir + self.cells = args.cells - cmd = ( - f'/SGRNJ03/randd/zhouxin/software/TRUST4/fastq-extractor ' - f'-t 10 -f /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa ' - f'-o {prefix} --barcodeStart 0 --barcodeEnd 23 ' - f'-u {fq2} ' - f'--barcode {fq1}' - ) + @utils.add_log + def get_barcodes(self): + tsne = glob.glob(f'{self.match_dir}/06.analysis/*_tsne_coord.tsv') + tsne = tsne[0] + tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) + barcodes = tsne_coord.index.tolist() - os.system(cmd) + # write barcodes + res = [] + for barcode in barcodes: + barcode = Seq(barcode) + barcode_reversed = barcode.reverse_complement() + bc = str(barcode_reversed) + res.append(bc) + df = pd.DataFrame(res, columns=['barcode']) -def mapping_summary(outdir, Seqtype, fq, species): - - stat_file = outdir + '/stat.txt' + 
return df - trust_assemble_summary = [] + @utils.add_log + def cut_off(self): + barcodes = self.get_barcodes() + df = count_fq(self.fq1) + df_umi = df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) + df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) - total_mapped = 0 + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi = df_umi.reset_index() - #with pysam.FastxFile(fq) as fh: - #total_count = 0 - #for entry in fh: - #total_count += 1 + UMI_num = int(self.cells) + rank = UMI_num / 100 + rank_UMI = df_umi.loc[rank, 'UMI'] + UMI_min = int(rank_UMI / 10) - if Seqtype == 'TCR': - loci = ['TRA', 'TRB'] - stat_string = 'All reads Mapped to TRA and TRB' + df_umi_filtered = df_umi[df_umi.UMI >= UMI_min] - elif Seqtype == 'BCR': - loci = ['IGH', 'IGL', 'IGK'] - stat_string = 'All reads Mapped to IGH, IGL and IGK' + df_tmp = pd.merge(df_umi_filtered, barcodes, on='barcode', how='inner') - for locus in loci: - cmd = ( - f'source activate bracer; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{locus} ' - f'-U {fq} ' - f'-S {outdir}/{locus}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - - with open(f'{outdir}/log') as fh: - for line in fh: - if 'reads; of these:' in line: - attr = re.findall(r'\d+', line) - total_count = int(attr[0]) - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - item = f'Reads mapped to {locus}' - count = int(res[0]) - total_mapped += count - trust_assemble_summary.append({ - 'item': item, - 'count': count, - 'total_count': total_count, - }) - # os.system(f'rm {outdir}/{locus}.sam') - - # total mapping - cmd = ( - f'source activate bracer; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{Seqtype} ' - f'-U {fq} ' - f'-S {outdir}/{Seqtype}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - with open(f'{outdir}/log') as fh: - for line in fh: - if 'reads; of these:' in line: - attr = re.findall(r'\d+', line) - total_count = int(attr[0]) - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - count = int(res[0]) - trust_assemble_summary.insert(0, { - 'item': stat_string, - 'count': count, - 'total_count': total_count, - }) - - os.system(f'rm {outdir}/*.sam') - os.system(f'rm {outdir}/log') - - df = pd.DataFrame(trust_assemble_summary, columns=['item', 'count', 'total_count']) - - utils.gen_stat(df, stat_file) - - -class Trust_assemble(Step): - """ - Features + matched_barcodes = df_tmp.barcode.tolist() + with open(f'{self.outdir}/{self.sample}_matched_barcodes.txt', 'w') as fh: + for barcode in matched_barcodes: + fh.write(str(barcode)+ '\n') + string = f'Get {len(matched_barcodes)} matched barcodes' - - Get fq file - """ + Assemble.cut_off.logger.info(string) - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) + df_all = pd.merge(df_tmp, df, on='barcode', how='outer') + seq_list = df_all['seq_name'].tolist() - self.outdir = args.outdir - self.match_dir = args.match_dir - self.Seqtype = args.Seqtype - self.fq1 = args.fq1 - self.fq2 = args.fq2 - self.sample = args.sample - self.species = args.species - self.speed_up = args.speed_up + with open(f'{self.outdir}/seqlist.txt', 'w') as fh: + for name in seq_list: + fh.write(str(name)+'\n') @utils.add_log def getFqfile(self): - match_barcodes(self.outdir, self.match_dir, self.Seqtype, self.fq1) cmd1 = ( f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > 
{self.outdir}/{self.sample}_matched_R1.fq' @@ -179,19 +119,19 @@ class Trust_assemble(Step): @utils.add_log def run(self): - if not os.path.exists(f'{self.outdir}/{self.sample}_matched_R2.fq'): - self.getFqfile() + self.cut_off() + self.getFqfile() species = self.species - index_file = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa' - ref = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_IMGT+C.fa' + index_file = f'{TRUST}/index/{species}/{species}_ref.fa' + ref = f'{TRUST}/index/{species}/{species}_IMGT+C.fa' string1 = '' if self.speed_up: string1 = '--repseq ' cmd = ( - f'{TRUST} -t {self.thread} ' + f'{TRUST}/run-trust4 -t {self.thread} ' f'-u {self.outdir}/{self.sample}_matched_R2.fq ' f'--barcode {self.outdir}/{self.sample}_matched_R1.fq ' f'--barcodeRange 0 23 + ' @@ -201,7 +141,7 @@ class Trust_assemble(Step): f'-o {self.sample} --od {self.outdir}/TRUST4' ) - Trust_assemble.run.logger.info(cmd) + Assemble.run.logger.info(cmd) if not os.path.exists(f'{self.outdir}/TRUST4/{self.sample}_barcode_report.tsv'): os.system(cmd) @@ -209,32 +149,25 @@ class Trust_assemble(Step): #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' # report - clean_fq(self.fq1, self.fq2, self.outdir, self.sample, species) - - fq = f'{self.outdir}/{self.sample}_clean.fq' - - mapping_summary(self.outdir, self.Seqtype, fq, species) - - os.remove(f'{self.outdir}/seqlist.txt') - - self.clean_up() + os.system(f'rm {self.outdir}/seqlist.txt') @utils.add_log -def trust_assemble(args): - step_name = 'trust_assemble' - trust_assemble_obj = Trust_assemble(args, step_name) - trust_assemble_obj.run() +def assemble(args): + step_name = 'assemble' + assemble_obj = Assemble(args, step_name) + assemble_obj.run() -def get_opts_trust_assemble(parser, sub_program): +def get_opts_assemble(parser, sub_program): if sub_program: parser = s_common(parser) parser.add_argument('--fq1', help='R1 reads from barcode step', required=True) parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) - parser.add_argument('--match_dir', help='match_dir', required=True) - parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--match_dir', help='rna analysis dir', required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--cells', help='expected cell number', default=3000) parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') diff --git a/celescope/trust_vdj/convert.py b/celescope/trust_vdj/convert.py index 4039a215..511e4bcb 100644 --- a/celescope/trust_vdj/convert.py +++ b/celescope/trust_vdj/convert.py @@ -1,12 +1,7 @@ """barcode step.""" -import os import re -import subprocess -import sys -import glob -from collections import defaultdict, Counter -from itertools import combinations, product +from collections import Counter import pandas as pd import pysam diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py new file mode 100644 index 00000000..a303e46d --- /dev/null +++ b/celescope/trust_vdj/mapping.py @@ -0,0 +1,122 @@ +import pandas as pd +import glob +from celescope.tools.Step import Step, s_common +from celescope.tools import utils +import os +import re + + +class Mapping(Step): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.match_dir = args.match_dir + self.Seqtype = args.Seqtype + self.sample = 
args.sample + self.species = args.species + + @utils.add_log + def align(self): + species = self.species + outdir = self.outdir + Seqtype = self.Seqtype + + stat_file = self.outdir + '/stat.txt' + fq = f'{outdir}/../02.assemble/TRUST4/{self.sample}_toassemble.fq' + + mapping_summary = [] + + total_mapped = 0 + + #with pysam.FastxFile(fq) as fh: + #total_count = 0 + #for entry in fh: + #total_count += 1 + + if Seqtype == 'TCR': + loci = ['TRA', 'TRB'] + stat_string = 'All reads Mapped to TRA and TRB' + + elif Seqtype == 'BCR': + loci = ['IGH', 'IGL', 'IGK'] + stat_string = 'All reads Mapped to IGH, IGL and IGK' + + for locus in loci: + cmd = ( + f'source activate bracer; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{locus} ' + f'-U {fq} ' + f'-S {outdir}/{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + item = f'Reads mapped to {locus}' + count = int(res[0]) + total_mapped += count + mapping_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + # os.system(f'rm {outdir}/{locus}.sam') + + # total mapping + cmd = ( + f'source activate full_len_VDJ; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{Seqtype} ' + f'-U {fq} ' + f'-S {outdir}/{Seqtype}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + count = int(res[0]) + mapping_summary.insert(0, { + 'item': stat_string, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/*.sam') + os.system(f'rm {outdir}/log') + + df = pd.DataFrame(mapping_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + + @utils.add_log + def run(self): + self.align() + + self.clean_up() + + +def mapping(args): + step_name = 'mapping' + mapping_obj = Mapping(args, step_name) + mapping_obj.run() + + +def get_opts_mapping(parser, sub_program): + if sub_program: + parser = s_common(parser) + + parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + + diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py index 8c89ad45..bd5ecc1e 100644 --- a/celescope/trust_vdj/multi_trust_vdj.py +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -15,8 +15,8 @@ class Multi_trust_vdj(Multi): self.process_cmd(cmd, step, sample, m=5, x=1) - def trust_assemble(self, sample): - step = 'trust_assemble' + def assemble(self, sample): + step = 'assemble' cmd_line = self.get_cmd_line(step, sample) fq1 = f'{self.outdir_dic[sample]["convert"]}/{sample}_1.fq{self.fq_suffix}' fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' @@ -29,6 +29,15 @@ class Multi_trust_vdj(Multi): self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) + def mapping(self, sample): + step = 'mapping' + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line}' + ) + self.process_cmd(cmd, step, sample, m=5, x=5) + + def res_filter(self, sample): step = 
'res_filter' cmd_line = self.get_cmd_line(step, sample) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index ed7a2720..eb69837f 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -63,13 +63,13 @@ def get_clone_table(df, Seqtype): if Seqtype == 'BCR': chains = ['IGH', 'IGL', 'IGK'] paired_groups = ['IGH_IGL', 'IGH_IGK'] - for chain in chains: - tmp = df[df['V'].str.contains(chain, na=False)] + for c in chains: + tmp = df[df['V'].str.contains(c, na=False)] tmp = tmp.set_index('barcode') - tmp = tmp.rename(columns=lambda x: f'{chain}_'+x) + tmp = tmp.rename(columns=lambda x: f'{c}_'+x) res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None') - group_type.append(f'{chain}_CDR3aa') + group_type.append(f'{c}_CDR3aa') Frequent = [''] * res.shape[0] res.insert(res.shape[1], 'Frequent', Frequent) -- Gitee From 58cd8ec57fe7d56a7ee33eb27e6923af2f0d7796 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 25 Jun 2021 20:06:30 +0800 Subject: [PATCH 93/96] change pipeline --- celescope/trust_vdj/__init__.py | 1 + celescope/trust_vdj/assemble.py | 109 +------------------- celescope/trust_vdj/mapping.py | 6 +- celescope/trust_vdj/matching.py | 135 +++++++++++++++++++++++++ celescope/trust_vdj/multi_trust_vdj.py | 29 +++++- celescope/trust_vdj/res_filter.py | 14 ++- 6 files changed, 180 insertions(+), 114 deletions(-) create mode 100644 celescope/trust_vdj/matching.py diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py index 8b95beeb..0e8f35cf 100644 --- a/celescope/trust_vdj/__init__.py +++ b/celescope/trust_vdj/__init__.py @@ -1,6 +1,7 @@ __STEPS__ = [ 'sample', 'convert', + 'matching', 'assemble', 'mapping', 'res_filter'] diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py index 6eb35684..6109d967 100644 --- a/celescope/trust_vdj/assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -1,39 +1,17 @@ import os from celescope.tools import utils from celescope.tools.Step import Step, s_common -import pysam import pandas as pd -from collections import defaultdict -import glob -import re -from Bio.Seq import Seq TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/' -@utils.add_log -def count_fq(fq): - dic = defaultdict(list) - with pysam.FastxFile(fq) as fq: - for entry in fq: - attr = entry.sequence - cb = attr[:24] - umi = attr[24:] - name = entry.name - dic['barcode'].append(cb) - dic['UMI'].append(umi) - dic['seq_name'].append(name) - - count_df = pd.DataFrame(dic, columns=list(dic.keys())) - - return count_df - class Assemble(Step): """ Features - - Get fq file + - Assemble TCR/BCR """ def __init__(self, args, step_name): @@ -45,83 +23,11 @@ class Assemble(Step): self.sample = args.sample self.species = args.species self.speed_up = args.speed_up - self.match_dir = args.match_dir - self.cells = args.cells - - @utils.add_log - def get_barcodes(self): - tsne = glob.glob(f'{self.match_dir}/06.analysis/*_tsne_coord.tsv') - tsne = tsne[0] - tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) - barcodes = tsne_coord.index.tolist() - - # write barcodes - res = [] - for barcode in barcodes: - barcode = Seq(barcode) - barcode_reversed = barcode.reverse_complement() - bc = str(barcode_reversed) - res.append(bc) - - df = pd.DataFrame(res, columns=['barcode']) - - return df - - @utils.add_log - def cut_off(self): - barcodes = self.get_barcodes() - df = count_fq(self.fq1) - df_umi = df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) - df_umi = 
df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) - - df_umi = df_umi.sort_values(by='UMI', ascending=False) - df_umi = df_umi.reset_index() - - UMI_num = int(self.cells) - rank = UMI_num / 100 - rank_UMI = df_umi.loc[rank, 'UMI'] - UMI_min = int(rank_UMI / 10) - - df_umi_filtered = df_umi[df_umi.UMI >= UMI_min] - - df_tmp = pd.merge(df_umi_filtered, barcodes, on='barcode', how='inner') - - matched_barcodes = df_tmp.barcode.tolist() - with open(f'{self.outdir}/{self.sample}_matched_barcodes.txt', 'w') as fh: - for barcode in matched_barcodes: - fh.write(str(barcode)+ '\n') - string = f'Get {len(matched_barcodes)} matched barcodes' - - Assemble.cut_off.logger.info(string) - - df_all = pd.merge(df_tmp, df, on='barcode', how='outer') - seq_list = df_all['seq_name'].tolist() - - with open(f'{self.outdir}/seqlist.txt', 'w') as fh: - for name in seq_list: - fh.write(str(name)+'\n') - - - @utils.add_log - def getFqfile(self): - - cmd1 = ( - f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R1.fq' - ) - os.system(cmd1) - - cmd2 = ( - f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R2.fq' - ) - os.system(cmd2) @utils.add_log def run(self): - self.cut_off() - self.getFqfile() - species = self.species index_file = f'{TRUST}/index/{species}/{species}_ref.fa' @@ -132,8 +38,8 @@ class Assemble(Step): string1 = '--repseq ' cmd = ( f'{TRUST}/run-trust4 -t {self.thread} ' - f'-u {self.outdir}/{self.sample}_matched_R2.fq ' - f'--barcode {self.outdir}/{self.sample}_matched_R1.fq ' + f'-u {self.fq2} ' + f'--barcode {self.fq1} ' f'--barcodeRange 0 23 + ' f'-f {index_file} ' f'--ref {ref} ' @@ -148,9 +54,6 @@ class Assemble(Step): #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' - # report - os.system(f'rm {self.outdir}/seqlist.txt') - @utils.add_log def assemble(args): @@ -162,12 +65,10 @@ def assemble(args): def get_opts_assemble(parser, sub_program): if sub_program: parser = s_common(parser) - parser.add_argument('--fq1', help='R1 reads from barcode step', required=True) - parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) - parser.add_argument('--match_dir', help='rna analysis dir', required=True) + parser.add_argument('--fq1', help='R1 reads from match step', required=True) + parser.add_argument('--fq2', help='R2 reads from match step', required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) - parser.add_argument('--cells', help='expected cell number', default=3000) parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py index a303e46d..d187befa 100644 --- a/celescope/trust_vdj/mapping.py +++ b/celescope/trust_vdj/mapping.py @@ -11,19 +11,20 @@ class Mapping(Step): Step.__init__(self, args, step_name) self.outdir = args.outdir - self.match_dir = args.match_dir self.Seqtype = args.Seqtype self.sample = args.sample self.species = args.species + self.fq = args.fq + @utils.add_log def align(self): species = self.species outdir = self.outdir Seqtype = self.Seqtype + fq = self.fq stat_file = self.outdir + '/stat.txt' - fq = f'{outdir}/../02.assemble/TRUST4/{self.sample}_toassemble.fq' mapping_summary = [] @@ -115,6 +116,7 @@ def mapping(args): def get_opts_mapping(parser, sub_program): if sub_program: parser = s_common(parser) + parser.add_argument('--fq', help='to assemble fastq', required=True) 
parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) diff --git a/celescope/trust_vdj/matching.py b/celescope/trust_vdj/matching.py new file mode 100644 index 00000000..17c20d6f --- /dev/null +++ b/celescope/trust_vdj/matching.py @@ -0,0 +1,135 @@ +import os +from celescope.tools import utils +from celescope.tools.Step import Step, s_common +import pysam +import pandas as pd +from collections import defaultdict +import glob +import re +from Bio.Seq import Seq + + +@utils.add_log +def count_fq(fq): + dic = defaultdict(list) + with pysam.FastxFile(fq) as fq: + for entry in fq: + attr = entry.sequence + cb = attr[:24] + umi = attr[24:] + name = entry.name + dic['barcode'].append(cb) + dic['UMI'].append(umi) + dic['seq_name'].append(name) + + count_df = pd.DataFrame(dic, columns=list(dic.keys())) + + return count_df + + +class Matching(Step): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.fq1 = args.fq1 + self.fq2 = args.fq2 + self.sample = args.sample + self.match_dir = args.match_dir + self.cells = args.cells + + + @utils.add_log + def get_barcodes(self): + tsne = glob.glob(f'{self.match_dir}/06.analysis/*_tsne_coord.tsv') + tsne = tsne[0] + tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) + barcodes = tsne_coord.index.tolist() + + # write barcodes + res = [] + for barcode in barcodes: + barcode = Seq(barcode) + barcode_reversed = barcode.reverse_complement() + bc = str(barcode_reversed) + res.append(bc) + + df = pd.DataFrame(res, columns=['barcode']) + + return df + + + @utils.add_log + def cut_off(self): + barcodes = self.get_barcodes() + df = count_fq(self.fq1) + df_umi = df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) + df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) + + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi = df_umi.reset_index() + df_umi.to_csv(f'{self.outdir}/count.txt', sep='\t', index=False) + + UMI_num = int(self.cells) + rank = UMI_num / 100 + rank_UMI = df_umi.loc[rank, 'UMI'] + UMI_min = int(rank_UMI / 10) + + df_umi_filtered = df_umi[df_umi.UMI >= UMI_min] + + df_tmp = pd.merge(df_umi_filtered, barcodes, on='barcode', how='inner') + + matched_barcodes = df_tmp.barcode.tolist() + with open(f'{self.outdir}/{self.sample}_matched_barcodes.txt', 'w') as fh: + for barcode in matched_barcodes: + fh.write(str(barcode)+ '\n') + string = f'Get {len(matched_barcodes)} matched barcodes' + + Matching.cut_off.logger.info(string) + + df_all = pd.merge(df_tmp, df, on='barcode', how='outer') + seq_list = df_all['seq_name'].tolist() + + with open(f'{self.outdir}/seqlist.txt', 'w') as fh: + for name in seq_list: + fh.write(str(name)+'\n') + + + @utils.add_log + def getFqfile(self): + + cmd1 = ( + f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R1.fq' + ) + os.system(cmd1) + + cmd2 = ( + f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R2.fq' + ) + os.system(cmd2) + + os.system(f'rm {self.outdir}/seqlist.txt') + + + @utils.add_log + def run(self): + self.cut_off() + self.getFqfile() + + +@utils.add_log +def matching(args): + step_name = 'matching' + match_obj = Matching(args, step_name) + match_obj.run() + + +def get_opts_matching(parser, sub_program): + if sub_program: + parser = s_common(parser) + parser.add_argument('--match_dir', 
help='rna analysis dir', required=True) + parser.add_argument('--fq1', help='R1 reads from convert step', required=True) + parser.add_argument('--fq2', help='R2 reads from convert step', required=True) + parser.add_argument('--cells', help='expected cell number', default=3000) + + diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py index bd5ecc1e..44e27e83 100644 --- a/celescope/trust_vdj/multi_trust_vdj.py +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -15,25 +15,40 @@ class Multi_trust_vdj(Multi): self.process_cmd(cmd, step, sample, m=5, x=1) - def assemble(self, sample): - step = 'assemble' + def matching(self, sample): + step = 'matching' cmd_line = self.get_cmd_line(step, sample) fq1 = f'{self.outdir_dic[sample]["convert"]}/{sample}_1.fq{self.fq_suffix}' - fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' + fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' cmd = ( f'{cmd_line} ' f'--fq1 {fq1} ' f'--fq2 {fq2} ' f'--match_dir {self.col4_dict[sample]}' ) + self.process_cmd(cmd, step, sample, m=5, x=3) + + + def assemble(self, sample): + step = 'assemble' + cmd_line = self.get_cmd_line(step, sample) + fq1 = f'{self.outdir_dic[sample]["matching"]}/{sample}_matched_R1.fq' + fq2 = f'{self.outdir_dic[sample]["matching"]}/{sample}_matched_R2.fq' + cmd = ( + f'{cmd_line} ' + f'--fq1 {fq1} ' + f'--fq2 {fq2} ' + ) self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) def mapping(self, sample): step = 'mapping' cmd_line = self.get_cmd_line(step, sample) + fq = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_toassemble.fq' cmd = ( - f'{cmd_line}' + f'{cmd_line} ' + f'--fq {fq}' ) self.process_cmd(cmd, step, sample, m=5, x=5) @@ -41,8 +56,14 @@ class Multi_trust_vdj(Multi): def res_filter(self, sample): step = 'res_filter' cmd_line = self.get_cmd_line(step, sample) + report = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_barcode_report.tsv' + fa = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_annot.fa' + count_file = f'{self.outdir_dic[sample]["matching"]}/count.txt' cmd = ( f'{cmd_line} ' + f'--report {report} ' + f'--fa {fa} ' + f'--count_file {count_file} ' ) self.process_cmd(cmd, step, sample, m=5, x=1) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index eb69837f..fabd63af 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -137,12 +137,15 @@ class Res_filter(Step): self.sample = args.sample self.Seqtype = args.Seqtype self.full_length = args.full_length + self.report = args.report + self.fa = args.fa + self.count_file = args.count_file @utils.add_log def run(self): - barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' - fa = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_annot.fa' + barcode_report = self.report + fa = self.fa df = beauty_report(barcode_report, fa) if self.full_length: @@ -152,7 +155,7 @@ class Res_filter(Step): clones, res_filter_summary = get_clone_table(df, self.Seqtype) # plot barcode umi - count_file = f'{self.outdir}/../02.trust_assemble/count.txt' + count_file = self.count_file df_umi = pd.read_csv(count_file, sep='\t', index_col=False) cells = set(df['barcode'].tolist()) df_umi['mark'] = df_umi['barcode'].apply(lambda x: 'CB' if (x in cells) else 'UB') @@ -212,4 +215,7 @@ def get_opts_res_filter(parser, sub_program): parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) 
parser.add_argument('--full_length', help='only output full length assembly', action='store_true')
     if sub_program:
-        parser = s_common(parser)
\ No newline at end of file
+        parser = s_common(parser)
+        parser.add_argument('--report', help='assemble report', required=True)
+        parser.add_argument('--fa', help='assembled fasta file', required=True)
+        parser.add_argument('--count_file', help='UMI count file', required=True)
\ No newline at end of file
-- 
Gitee


From 3e506454fdb030eaa67efb8763019f9000728081 Mon Sep 17 00:00:00 2001
From: zhouxinseeu
Date: Mon, 28 Jun 2021 11:26:08 +0800
Subject: [PATCH 94/96] add func doc and add rerun opt

---
 celescope/trust_vdj/assemble.py        | 16 +++++++++++++---
 celescope/trust_vdj/convert.py         | 15 ++++++++++++++-
 celescope/trust_vdj/mapping.py         |  6 ++++++
 celescope/trust_vdj/matching.py        | 15 ++++++++++++++-
 celescope/trust_vdj/multi_trust_vdj.py |  6 +++---
 celescope/trust_vdj/res_filter.py      |  9 +++++++++
 6 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py
index 6109d967..c4e0abdb 100644
--- a/celescope/trust_vdj/assemble.py
+++ b/celescope/trust_vdj/assemble.py
@@ -11,7 +11,17 @@ class Assemble(Step):
     """
     Features
 
-    - Assemble TCR/BCR
+    - Assemble TCR/BCR seq data.
+
+    Output
+
+    - `03.assemble/{sample}_toassemble.fq` Reads to assemble.
+    - `03.assemble/{sample}_toassemble_bc.fa` Barcodes to assemble.
+    - `03.assemble/{sample}_cdr3.out` All assembled CDR3 output.
+    - `03.assemble/{sample}_barcode_report.tsv` Record chain information in each barcode.
+    - `03.assemble/{sample}_annot.fa` Assembled annotated contig sequences.
+    - `03.assemble/{sample}_assembled_reads.fa` Assembled raw reads.
+    - `03.assemble/{sample}_report.tsv` Record assembled CDR3 types and count.
     """
 
     def __init__(self, args, step_name):
@@ -44,12 +54,12 @@ class Assemble(Step):
             f'-f {index_file} '
             f'--ref {ref} '
             f'{string1}'
-            f'-o {self.sample} --od {self.outdir}/TRUST4'
+            f'-o {self.sample} --od {self.outdir}'
         )
 
         Assemble.run.logger.info(cmd)
 
-        if not os.path.exists(f'{self.outdir}/TRUST4/{self.sample}_barcode_report.tsv'):
+        if not os.path.exists(f'{self.outdir}/{self.sample}_barcode_report.tsv'):
             os.system(cmd)
 
         #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq'
diff --git a/celescope/trust_vdj/convert.py b/celescope/trust_vdj/convert.py
index 511e4bcb..440fdfe7 100644
--- a/celescope/trust_vdj/convert.py
+++ b/celescope/trust_vdj/convert.py
@@ -16,8 +16,21 @@ from celescope.tools.Step import Step, s_common
 
 
 class Convert(Step):
-    '''convert step class
+    '''
+    Features
+
+    - Demultiplex barcodes.
+    - Filter invalid R1 reads, which include:
+        - Reads without linker: the mismatch between linkers and all linkers in the whitelist is greater than 2.
+        - Reads without correct barcode: the mismatch between barcodes and all barcodes in the whitelist is greater than 1.
+        - Reads without polyT: the number of T bases in the defined polyT region is less than 10.
+        - Low quality reads: low sequencing quality in barcode and UMI regions.
+
+    Output
+
+    - `01.convert/{sample}_1.fq(.gz)`, `01.convert/{sample}_2.fq(.gz)`. Barcode and UMI are contained in the R1 reads.
     '''
+
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py
index d187befa..b6a5b09b 100644
--- a/celescope/trust_vdj/mapping.py
+++ b/celescope/trust_vdj/mapping.py
@@ -7,6 +7,12 @@ import re
 
 
 class Mapping(Step):
+    """
+    Features
+
+    - Calculate the mapping rate of reads to each V(D)J locus and to all loci combined.
+
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
diff --git a/celescope/trust_vdj/matching.py b/celescope/trust_vdj/matching.py
index 17c20d6f..2c5f1e33 100644
--- a/celescope/trust_vdj/matching.py
+++ b/celescope/trust_vdj/matching.py
@@ -28,6 +28,19 @@ def count_fq(fq):
 
 
 class Matching(Step):
+    """
+    Features
+
+    - Cut off V(D)J barcodes by UMI count. The default threshold is 1/10 of the UMI count of the barcode ranked at `--cells`/100 (the 30th barcode for the default 3000 cells).
+    - Match V(D)J barcodes that pass the cut-off with RNA cell barcodes.
+
+    Output
+
+    - `02.matching/count.txt`. Records the UMI count of each barcode in the raw V(D)J data.
+    - `02.matching/{sample}_matched_barcodes.txt`. Contains the matched barcodes.
+    - `02.matching/{sample}_matched_R1.fq`, `02.matching/{sample}_matched_R2.fq`. Barcode and UMI are contained in the R1 reads.
+
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
@@ -87,7 +100,7 @@ class Matching(Step):
 
         Matching.cut_off.logger.info(string)
 
-        df_all = pd.merge(df_tmp, df, on='barcode', how='outer')
+        df_all = pd.merge(df_tmp, df, on='barcode', how='inner')
         seq_list = df_all['seq_name'].tolist()
 
         with open(f'{self.outdir}/seqlist.txt', 'w') as fh:
diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py
index 44e27e83..bb331390 100644
--- a/celescope/trust_vdj/multi_trust_vdj.py
+++ b/celescope/trust_vdj/multi_trust_vdj.py
@@ -45,7 +45,7 @@ class Multi_trust_vdj(Multi):
     def mapping(self, sample):
         step = 'mapping'
         cmd_line = self.get_cmd_line(step, sample)
-        fq = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_toassemble.fq'
+        fq = f'{self.outdir_dic[sample]["assemble"]}/{sample}_toassemble.fq'
         cmd = (
             f'{cmd_line} '
             f'--fq {fq}'
         )
@@ -56,8 +56,8 @@ class Multi_trust_vdj(Multi):
     def res_filter(self, sample):
         step = 'res_filter'
         cmd_line = self.get_cmd_line(step, sample)
-        report = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_barcode_report.tsv'
-        fa = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_annot.fa'
+        report = f'{self.outdir_dic[sample]["assemble"]}/{sample}_barcode_report.tsv'
+        fa = f'{self.outdir_dic[sample]["assemble"]}/{sample}_annot.fa'
         count_file = f'{self.outdir_dic[sample]["matching"]}/count.txt'
         cmd = (
             f'{cmd_line} '
diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py
index fabd63af..85b30209 100644
--- a/celescope/trust_vdj/res_filter.py
+++ b/celescope/trust_vdj/res_filter.py
@@ -129,6 +129,15 @@ def get_clone_table(df, Seqtype):
 
 
 class Res_filter(Step):
+    """
+    Features
+
+    - Calculate clonetypes.
+
+    Output
+    - `05.res_filter/clonetypes.tsv` Record each clonetype and its frequency.
+    - `05.res_filter/{sample}_barcode_report.tsv` Record detailed chain information of each barcode.
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
-- 
Gitee
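The cut-off rule documented in `matching.py` above reads as a standalone recipe: rank barcodes by UMI count, take the count of the barcode at rank `--cells`/100, and keep every barcode with at least one tenth of that count. A minimal sketch under those assumptions — `umi_cutoff` and `expected_cells` are illustrative names, not part of the patch, and the input is assumed to be a pandas DataFrame with one row per barcode and a `UMI` column:

    import pandas as pd

    def umi_cutoff(df_umi, expected_cells=3000):
        # Rank barcodes by UMI count, highest first.
        df_umi = df_umi.sort_values(by='UMI', ascending=False).reset_index(drop=True)
        # UMI count of the barcode at rank expected_cells / 100 (30 by default).
        rank = int(expected_cells / 100)
        umi_min = int(df_umi.loc[rank, 'UMI'] / 10)
        # Keep every barcode at or above one tenth of that count.
        return df_umi[df_umi.UMI >= umi_min]

    # 100 barcodes with UMI counts 1000, 990, ..., 10: the barcode at rank 30
    # has 700 UMIs, so the threshold is 70 and 94 barcodes survive.
    df = pd.DataFrame({'barcode': [f'bc{i}' for i in range(100)],
                       'UMI': range(1000, 0, -10)})
    print(len(umi_cutoff(df)))  # 94
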
+ """ def __init__(self, args, step_name): Step.__init__(self, args, step_name) -- Gitee From 1b29c57629bdeac5b9d96c6c828c5c1927397c9e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 28 Jun 2021 11:26:32 +0800 Subject: [PATCH 95/96] add rerun option --- celescope/trust_vdj/assemble.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py index c4e0abdb..2c6e4531 100644 --- a/celescope/trust_vdj/assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -33,6 +33,7 @@ class Assemble(Step): self.sample = args.sample self.species = args.species self.speed_up = args.speed_up + self.rerun = args.rerun @utils.add_log @@ -62,6 +63,9 @@ class Assemble(Step): if not os.path.exists(f'{self.outdir}/{self.sample}_barcode_report.tsv'): os.system(cmd) + if self.rerun: + os.system(cmd) + #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' @@ -79,6 +83,7 @@ def get_opts_assemble(parser, sub_program): parser.add_argument('--fq2', help='R2 reads from match step', required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--rerun', help='Re-run the assemble step', action='store_true') parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') -- Gitee From 09bf64bc3aad3129fddef68625fb48743fd11b52 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 28 Jun 2021 11:28:52 +0800 Subject: [PATCH 96/96] rm unused import --- celescope/trust_vdj/assemble.py | 1 - celescope/trust_vdj/mapping.py | 1 - celescope/trust_vdj/matching.py | 1 - 3 files changed, 3 deletions(-) diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py index 2c6e4531..d0c12d11 100644 --- a/celescope/trust_vdj/assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -1,7 +1,6 @@ import os from celescope.tools import utils from celescope.tools.Step import Step, s_common -import pandas as pd TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/' diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py index b6a5b09b..52304187 100644 --- a/celescope/trust_vdj/mapping.py +++ b/celescope/trust_vdj/mapping.py @@ -1,5 +1,4 @@ import pandas as pd -import glob from celescope.tools.Step import Step, s_common from celescope.tools import utils import os diff --git a/celescope/trust_vdj/matching.py b/celescope/trust_vdj/matching.py index 2c5f1e33..0646a420 100644 --- a/celescope/trust_vdj/matching.py +++ b/celescope/trust_vdj/matching.py @@ -5,7 +5,6 @@ import pysam import pandas as pd from collections import defaultdict import glob -import re from Bio.Seq import Seq -- Gitee