From 062403f6bfc0e0b8a68dfa4c1ccd78c13caa08e5 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Sat, 8 May 2021 00:44:33 +0800
Subject: [PATCH 01/96] add .h5 file in count.py
--- celescope/tools/count.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+)
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index 902e7524..e7ed3167 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py
@@ -12,6 +12,7 @@ import subprocess from scipy.io import mmwrite from scipy.sparse import csr_matrix, coo_matrix import pysam +import h5py from celescope.tools.utils import add_log, format_number, glob_genomeDir, gene_convert, s_common, add_mem from celescope.tools.cellranger3.cell_calling_3 import cell_calling_3 from celescope.tools.__init__ import MATRIX_FILE_NAME, FEATURE_FILE_NAME, BARCODE_FILE_NAME
@@ -262,6 +263,26 @@ def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', cell_bc=None genes.to_csv(f'{matrix_10X_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) barcodes.to_csv(f'{matrix_10X_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') mmwrite(f'{matrix_10X_dir}/{MATRIX_FILE_NAME}', mtx) + + bc_list = df_UMI.index.levels[1].tolist() + gene_name = list(genes['gene_name']) + gene_id = list(genes['gene_id']) + X_data, X_indices, X_indptr = df_UMI.UMI, df_UMI.index.labels[0], df_UMI.index.labels[1] + + f = h5py.File(f"{outdir}/{sample}.h5", "w") + dt = h5py.string_dtype(encoding='utf-8') + g1 = f.create_group('obs') + g1_d1 = g1.create_dataset('_index', data=bc_list, dtype=dt) + g2 = f.create_group('var') + g2_d1 = g2.create_dataset('_index', data=gene_name, dtype=dt) + g2_d2 = g2.create_dataset('gene_ids', data=gene_id, dtype=dt) + g3 = f.create_group('X') + g3_d1 = g3.create_dataset('data', data=X_data) + g3_d2 = g3.create_dataset('indices', data=X_indices) + g3_d3 = g3.create_dataset('indptr', data=X_indptr) + + f.close() + + return matrix_10X_dir
-- Gitee
From 2c0e59ffd3f715ebe08e8782c4bd3ebc0e864640 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Sat, 8 May 2021 00:45:40 +0800
Subject: [PATCH 02/96] auto
--- .DS_Store | Bin 0 -> 6148 bytes celescope/.DS_Store | Bin 0 -> 8196 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 .DS_Store create mode 100644 celescope/.DS_Store
diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..fde5083c0902b4b0b5ff087a8db22e54d5043b62
GIT binary patch literal 6148 [binary literal data omitted] literal 0 HcmV?d00001
diff --git a/celescope/.DS_Store b/celescope/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..78b57bb8670a1b98f6c056cf4c4e0cce4fd8f49b
GIT binary patch literal 8196 [binary literal data omitted; the tail of this patch, the mail separator, and the commit hash and author line of PATCH 03 were lost in extraction]
Date: Mon, 10 May 2021 13:47:31 +0800
Subject: [PATCH 03/96] rm .h5
--- celescope/tools/count.py | 18 ------------------ 1 file changed, 18 deletions(-)
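[Editor's note — not part of the patch series: PATCH 01 above writes the expression matrix into {sample}.h5 as three flat arrays under X/. As written, the datasets named indices and indptr actually hold the gene and barcode integer codes of each (gene, barcode, count) triplet — COO coordinates — rather than a true CSR index pointer, and df_UMI.index.labels is the pre-0.24 pandas MultiIndex API (index.codes in later versions). A minimal read-back sketch under those assumptions; the file name is illustrative:]

    import h5py
    from scipy.sparse import coo_matrix

    with h5py.File("sample.h5", "r") as f:
        genes = [g.decode() for g in f["var/_index"][:]]       # gene names
        barcodes = [b.decode() for b in f["obs/_index"][:]]    # cell barcodes
        # data / indices / indptr hold count, gene code, barcode code per entry
        mat = coo_matrix(
            (f["X/data"][:], (f["X/indices"][:], f["X/indptr"][:])),
            shape=(len(genes), len(barcodes)),
        )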
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index e7ed3167..61d2545b 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -264,24 +264,6 @@ def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', cell_bc=None barcodes.to_csv(f'{matrix_10X_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') mmwrite(f'{matrix_10X_dir}/{MATRIX_FILE_NAME}', mtx) - bc_list = df_UMI.index.levels[1].tolist() - gene_name = list(genes['gene_name']) - gene_id = list(genes['gene_id']) - X_data, X_indices, X_indptr = df_UMI.UMI, df_UMI.index.labels[0], df_UMI.index.labels[1] - - f = h5py.File(f"{outdir}/{sample}.h5", "w") - dt = h5py.string_dtype(encoding='utf-8') - g1 = f.create_group('obs') - g1_d1 = g1.create_dataset('_index', data=bc_list, dtype=dt) - g2 = f.create_group('var') - g2_d1 = g2.create_dataset('_index', data=gene_name, dtype=dt) - g2_d2 = g2.create_dataset('gene_ids', data=gene_id, dtype=dt) - g3 = f.create_group('X') - g3_d1 = g3.create_dataset('data', data=X_data) - g3_d2 = g3.create_dataset('indices', data=X_indices) - g3_d3 = g3.create_dataset('indptr', data=X_indptr) - - f.close() return matrix_10X_dir -- Gitee From 6b9005d7cd62cd5975569946ef611c369b96d7b3 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 14:40:29 +0800 Subject: [PATCH 04/96] update to seurat4.0 and add h5 --- celescope/tools/auto_assign.R | 21 ++++----- celescope/tools/run_analysis.R | 78 ++++++++++++++++++++++------------ conda_pkgs.txt | 2 +- 3 files changed, 64 insertions(+), 37 deletions(-) diff --git a/celescope/tools/auto_assign.R b/celescope/tools/auto_assign.R index efe4a810..f33b8e37 100755 --- a/celescope/tools/auto_assign.R +++ b/celescope/tools/auto_assign.R @@ -31,7 +31,7 @@ n_cell_name <- length(cell_name) #reset #all_data <- SetAllIdent(object = all_data, id = origin.cluster) -clusters <- sort(unique(all_data@ident)) +clusters <- sort(unique(all_data@active.ident)) #create dir auto_dir <- stringr::str_glue('{outdir}/{sample}_auto_assign/') @@ -47,9 +47,9 @@ for (cluster in clusters){ index = index + 1 pos = unlist(strsplit(marker_file[index,2,drop=T],",")) neg = tryCatch(unlist(strsplit(marker_file[index,3,drop=T],",")) ,error=function(e){} ) - for (feature in pos){ + for (F in pos){ tryCatch({ - dat <- FindMarkers(all_data,genes.use=feature,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) + dat <- FindMarkers(all_data,feature=F,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) dat$cell_type <- cell dat$cluster <- cluster dat <- rownames_to_column(dat,var="gene") @@ -61,13 +61,13 @@ for (cluster in clusters){ all_dat <- rbind(all_dat,dat) } } - ,error=function(e){print(paste0(feature," not found in cluster ",cluster)) }) + ,error=function(e){print(paste0(F," not found in cluster ",cluster)) }) } if (!is.na(neg) && !is.null(neg)){ - for (feature in neg){ + for (F in neg){ tryCatch({ - dat <- FindMarkers(all_data,genes.use=feature,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) + dat <- FindMarkers(all_data,feature=F,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) dat$cell_type <- cell dat$cluster <- cluster dat <- rownames_to_column(dat,var="gene") @@ -79,13 +79,14 @@ for (cluster in clusters){ all_dat <- rbind(all_dat,dat) } } - ,error=function(e){print(paste0(feature," not found in cluster ",cluster)) }) + ,error=function(e){print(paste0(F," not found in cluster ",cluster)) }) } } } } +all_dat$cluster <- as.numeric(all_dat$cluster) + 1 all_dat <- mutate(all_dat,pct.diff=pct.1-pct.2) exp.out = 
stringr::str_glue('{auto_dir}/{sample}_type_marker_exp.tsv') write_tsv(all_dat, exp.out) @@ -109,16 +110,16 @@ for (cluster in clusters){ dev.off() png(paste0(png_dir,cluster,"_logfc.png"),width=1200,height=1000) - p2 <- ggplot(c,aes(x=interaction(gene,cell_type,type),avg_logFC,fill=cell_type)) +geom_bar(stat="identity")+ coord_flip() + scale_fill_manual(values=color2) + p2 <- ggplot(c,aes(x=interaction(gene,cell_type,type),avg_log2FC,fill=cell_type)) +geom_bar(stat="identity")+ coord_flip() + scale_fill_manual(values=color2) print (p2) dev.off() } # auto assign -exp[exp$type=="negative",]$avg_logFC = -(exp[exp$type=="negative",]$avg_logFC) +exp[exp$type=="negative",]$avg_log2FC = -(exp[exp$type=="negative",]$avg_log2FC) exp[exp$type=="negative",]$pct.diff = -(exp[exp$type=="negative",]$pct.diff) a <- group_by(exp,cluster,cell_type) -as <- summarize(a,avg_pct.diff=mean(pct.diff),avg_logfc=mean(avg_logFC),max_p_val_adj=max(p_val_adj)) +as <- summarize(a,avg_pct.diff=mean(pct.diff),avg_logfc=mean(avg_log2FC),max_p_val_adj=max(p_val_adj)) as1 <- group_by(ungroup(as),cluster) as1 <- mutate(as1,pct_rank = rank(avg_pct.diff), logfc_rank= rank(avg_logfc),total_rank=pct_rank+logfc_rank) diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 5e0b5592..fe4e7415 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -1,6 +1,9 @@ -library(Seurat) +library(Seurat) # v4.0 library(tidyverse) library(argparser) +library(hdf5r) +library(rhdf5) + argv <- arg_parser('') argv <- add_argument(argv,"--matrix_file", help="matrix file") @@ -17,18 +20,32 @@ save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/") rds.out = paste0(outdir,'/',sample,'.rds') +# read 10X +matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") +rds = CreateSeuratObject(matrix, pro=sample) -rds = CreateSeuratObject(raw.data = matrix,project=sample) +# generate h5ad file +x = GetAssayData(rds,slot="count") +mtx = as.matrix(x) +barcode = colnames(rds) +geneid = rownames(rds) +h5.out = stringr::str_glue('{outdir}/{sample}.h5') +path <- path.expand(h5.out) +h5createFile(path) +h5f <- H5Fopen(path) +h5writeDataset(mtx,h5f,"X") +h5writeDataset(barcode,h5f,"obs") +h5writeDataset(geneid,h5f,"var") +H5Fclose(h5f) # mito -mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@data), value = TRUE, ignore.case=TRUE) -percent.mito <- Matrix::colSums(rds@raw.data[mito.genes,])/Matrix::colSums(rds@raw.data) +mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@assays$RNA@data), value = TRUE, ignore.case=TRUE) +percent.mito <- Matrix::colSums(rds@assays$RNA@counts[mito.genes,])/Matrix::colSums(rds@assays$RNA@counts) rds <- AddMetaData(object = rds, metadata = percent.mito, col.name = "percent.mito") meta = rds@meta.data total_cell = dim(meta)[1] @@ -43,44 +60,53 @@ mito_df$cell_percent = paste0(round(mito_df$cell_percent * 100,2),"%") mito_df$mito_percent = paste0("Fraction of cells have mito gene percent>",round(mito_df$mito_percent * 100,2),"%") write_delim(mito_df, mito.out, col_names=F, delim=":") -rds <- NormalizeData(object = rds, normalization.method = "LogNormalize",scale.factor = 10000) -rds <- FindVariableGenes(object = rds, mean.function = ExpMean, dispersion.function = LogVMR, 
x.low.cutoff = 0.1, y.cutoff = 1, do.contour=F) -use.gene = rds@var.genes -rds <- ScaleData(object = rds,vars.to.regress = c("nUMI", "percent.mito"),genes.use =use.gene) -rds <- RunPCA(object = rds, pc.genes = use.gene, do.print = FALSE) -rds <- FindClusters(object = rds, reduction.type = "pca", dims.use = 1:20, resolution = resolution, print.output = 0, save.SNN = TRUE,force.recalc = TRUE) -rds@meta.data[[res_str]] = as.numeric(rds@meta.data[[res_str]]) + 1 -rds = SetAllIdent(rds, res_str) - -# Run Non-linear dimensional reduction (tSNE) -rds <- RunTSNE(object = rds, dims.use = 1:20, do.fast = TRUE,check_duplicates = FALSE) + +rds <- NormalizeData(rds, normalization.method = "LogNormalize",scale.factor = 10000) +rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 2000, mean.cutoff = c(0.1, 8), dispersion.cutoff = c(1, Inf), + mean.function = ExpMean, dispersion.function = LogVMR) + +use.genes <- rds@assays$RNA@var.features +rds <- ScaleData(rds, vars.to.regress = c("nCount_RNA", "percent.mito"), features = use.genes) +rds <- RunPCA(object = rds, features = use.genes, do.print = FALSE) +rds <- FindNeighbors(rds, dims = 1:20, force.recalc = TRUE, reduction = "pca") +rds <- FindClusters(rds, resolution = resolution) + +# tsne and umap +rds <- RunTSNE(rds, dims = 1:20, do.fast = TRUE, check_duplicates = FALSE) + + tryCatch({ - rds.markers <- FindAllMarkers(object = rds, genes.use = use.gene) - rds.markers = dplyr::group_by(rds.markers,cluster) %>% dplyr::arrange(desc(avg_logFC)) + rds.markers <- FindAllMarkers(object = rds, features = use.genes) + rds.markers = dplyr::group_by(rds.markers,cluster) %>% dplyr::arrange(desc(avg_log2FC)) }, error = function(e){ print (paste0("no marker found: ", e)) rds.markers <<- data.frame(cluster=double(), - gene=double(), - avg_logFC=double(), - pct.1=double(), - pct.2=double(), - p_val_adj=double()) + gene=double(), + avg_log2FC=double(), + pct.1=double(), + pct.2=double(), + p_val_adj=double()) }) + +rds.markers$cluster = as.numeric(rds.markers$cluster) print (rds.markers) write_tsv(rds.markers,marker.out,col_names = T) -df.tsne = rds@dr$tsne@cell.embeddings + +df.tsne = rds@reductions$tsne@cell.embeddings df.tsne = as.data.frame(df.tsne) meta = rds@meta.data -dic = rds@meta.data[[res_str]] +dic = rds@meta.data[['seurat_clusters']] names(dic) = rownames(rds@meta.data) df.tsne$cluster = as.numeric(dic[rownames(df.tsne)]) -df.gene = meta[,"nGene",drop=F] +rds@meta.data$seurat_clusters = as.numeric(dic[rownames(df.tsne)]) +df.gene = meta[,"nFeature_RNA",drop=F] colnames(df.gene) = "Gene_Counts" df.all = cbind(df.tsne,df.gene) write.table(df.all,tsne.out,sep="\t",col.names=NA,quote = F) + if (save_rds == 'True'){ saveRDS(rds, rds.out) } \ No newline at end of file diff --git a/conda_pkgs.txt b/conda_pkgs.txt index f356aa68..3f032270 100644 --- a/conda_pkgs.txt +++ b/conda_pkgs.txt @@ -5,7 +5,7 @@ picard=2.18.17 ucsc-gtftogenepred=377 subread=2.0.1 samtools=1.9 -r-seurat=2.3.4 +r-seurat=4.0.1 r-argparser r-tidyverse mixcr=3.0.3 -- Gitee From d877b57d4e45ce25cdd6502dd2bb7503d72cf57e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 16:15:16 +0800 Subject: [PATCH 05/96] add .DS_store --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 37518ec5..6fb58215 100755 --- a/.gitignore +++ b/.gitignore @@ -149,4 +149,7 @@ cython_debug/ #temp /temp/ +# .DS_store +.DS_store + -- Gitee From 5259474cbec7ea7b310cfa35f52b70eb936a2313 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 
16:15:32 +0800
Subject: [PATCH 06/96] rm h5py
--- celescope/tools/count.py | 1 - 1 file changed, 1 deletion(-)
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index 61d2545b..f3540ffa 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py
@@ -12,7 +12,6 @@ import subprocess from scipy.io import mmwrite from scipy.sparse import csr_matrix, coo_matrix import pysam -import h5py from celescope.tools.utils import add_log, format_number, glob_genomeDir, gene_convert, s_common, add_mem from celescope.tools.cellranger3.cell_calling_3 import cell_calling_3 from celescope.tools.__init__ import MATRIX_FILE_NAME, FEATURE_FILE_NAME, BARCODE_FILE_NAME
-- Gitee
From 7a5c11ca873541f93426b125ec667a47630b1be1 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 16:21:46 +0800
Subject: [PATCH 07/96] merge
--- celescope/tools/run_analysis.R | 3 --- 1 file changed, 3 deletions(-)
diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index a761d16c..5883c06f 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R
@@ -20,10 +20,7 @@ save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -<<<<<<< HEAD -======= matrix = Seurat::Read10X(matrix_dir, gene.column=2) ->>>>>>> 39b2447c47c4295f6fdbbe970dc21e2e43b5ee5b tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/")
-- Gitee
From 347163dd0f8610f2716fe981abc2fa4ca337b5c0 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 10 May 2021 16:25:23 +0800
Subject: [PATCH 08/96] merge
--- celescope/tools/count.py | 296 --------------------------------------- 1 file changed, 296 deletions(-)
diff --git a/celescope/tools/count.py b/celescope/tools/count.py index ae203eac..4e7711b8 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py
@@ -253,302 +253,6 @@ class Count(Step): df_sum = df_sum.sort_values(col, ascending=False) return df_sum -<<<<<<< HEAD - for u in umi_arr: - if float(_dict[umi_low]) / _dict[u] > percent: - break - if hd(umi_low, u) == 1: - _dict[u] += _dict[umi_low] - del (_dict[umi_low]) - break - res_dict[geneID] = _dict - return res_dict - - -@add_log -def bam2table(bam, detail_file): - # group reads with the same barcode in the BAM and count, per gene, the reads mapped to it - - samfile = pysam.AlignmentFile(bam, "rb") - with gzip.open(detail_file, 'wt') as fh1: - fh1.write('\t'.join(['Barcode', 'geneID', 'UMI', 'count']) + '\n') - - # pysam.libcalignedsegment.AlignedSegment - # AAACAGGCCAGCGTTAACACGACC_CCTAACGT_A00129:340:HHH72DSXX:2:1353:23276:30843 - # get the barcode of each read - def keyfunc(x): return x.query_name.split('_', 1)[0] - - for _, g in groupby(samfile, keyfunc): - gene_umi_dict = defaultdict(lambda: defaultdict(int)) - for seg in g: - (barcode, umi) = seg.query_name.split('_')[:2] - if not seg.has_tag('XT'): - continue - geneID = seg.get_tag('XT') - gene_umi_dict[geneID][umi] += 1 - res_dict = correct_umi(fh1, barcode, gene_umi_dict) - - # output - for geneID in res_dict: - for umi in res_dict[geneID]: - fh1.write('%s\t%s\t%s\t%s\n' % (barcode, geneID, umi, - res_dict[geneID][umi])) - samfile.close() - - -@add_log -def cell_calling(cell_calling_method, force_cell_num, expected_cell_num, all_matrix_10X_dir, df_sum, outdir, sample): - if (force_cell_num is not None) and (force_cell_num != 'None'): - cell_bc, UMI_threshold = force_cell(force_cell_num, df_sum) - elif cell_calling_method == 'auto': - cell_bc, UMI_threshold = auto_cell(df_sum, expected_cell_num)
- elif cell_calling_method == 'cellranger3': - cell_bc, UMI_threshold = cellranger3_cell(all_matrix_10X_dir, expected_cell_num, df_sum) - elif cell_calling_method == 'inflection': - _cell_bc, UMI_threshold = auto_cell(df_sum, expected_cell_num) - cell_bc, UMI_threshold = inflection_cell(outdir, sample, all_matrix_10X_dir, df_sum, UMI_threshold) - cell_calling.logger.info(f'UMI_threshold: {UMI_threshold}') - return cell_bc, UMI_threshold - - -@add_log -def force_cell(force_cell_num, df_sum): - force_cell_num = int(force_cell_num) - cell_range = int(force_cell_num * 0.1) - cell_low = force_cell_num - cell_range - cell_high = force_cell_num + cell_range - - df_barcode_count = df_sum.groupby( - ['UMI']).size().reset_index( - name='barcode_counts') - sorted_df = df_barcode_count.sort_values("UMI", ascending=False) - sorted_df["barcode_cumsum"] = sorted_df["barcode_counts"].cumsum() - for i in range(sorted_df.shape[0]): - if sorted_df.iloc[i, :]["barcode_cumsum"] >= cell_low: - index_low = i - 1 - break - for i in range(sorted_df.shape[0]): - if sorted_df.iloc[i, :]["barcode_cumsum"] >= cell_high: - index_high = i - break - df_sub = sorted_df.iloc[index_low:index_high + 1, :] - threshold = df_sub.iloc[np.argmax( - np.diff(df_sub["barcode_cumsum"])), :]["UMI"] - cell_bc = get_cell_bc(df_sum, threshold, col='UMI') - - return cell_bc, threshold - - -def find_threshold(df_sum, idx): - return int(df_sum.iloc[idx - 1, df_sum.columns == 'UMI']) - - -@add_log -def auto_cell(df_sum, expected_cell_num): - col = "UMI" - idx = int(expected_cell_num * 0.01) - barcode_number = df_sum.shape[0] - idx = int(min(barcode_number, idx)) - if idx == 0: - sys.exit("cell number equals zero!") - # calculate read counts threshold - threshold = int(find_threshold(df_sum, idx) * 0.1) - threshold = max(1, threshold) - cell_bc = get_cell_bc(df_sum, threshold) - - return cell_bc, threshold - - -@add_log -def cellranger3_cell(all_matrix_10X_dir, expected_cell_num, df_sum): - cell_bc, initial_cell_num = cell_calling_3(all_matrix_10X_dir, expected_cell_num) - threshold = find_threshold(df_sum, initial_cell_num) - return cell_bc, threshold - - -@add_log -def inflection_cell(outdir, sample, all_matrix_10X_dir, df_sum, threshold): - app = f'{toolsdir}/rescue.R' - cmd = ( - f'Rscript {app} ' - f'--matrix_dir {all_matrix_10X_dir} ' - f'--outdir {outdir} ' - f'--sample {sample} ' - f'--threshold {threshold}' - ) - inflection_cell.logger.info(cmd) - subprocess.check_call(cmd, shell=True) - out_file = f'{outdir}/{sample}_rescue.tsv' - df = pd.read_csv(out_file, sep='\t') - inflection = int(df.loc[:,'inflection']) - threshold = inflection - cell_bc = get_cell_bc(df_sum, threshold) - - return cell_bc, threshold - - -@add_log -def get_df_sum(df, col='UMI'): - def num_gt2(x): - return pd.Series.sum(x[x > 1]) - - df_sum = df.groupby('Barcode').agg({ - 'count': ['sum', num_gt2], - 'UMI': 'count', - 'geneID': 'nunique' - }) - df_sum.columns = ['readcount', 'UMI2', 'UMI', 'geneID'] - df_sum = df_sum.sort_values(col, ascending=False) - return df_sum - -def get_cell_bc(df_sum, threshold, col='UMI'): - return list(df_sum[df_sum[col] >= threshold].index) - -@add_log -def plot_barcode_UMI(df_sum, threshold, expected_cell_num, cell_num, outdir, sample, cell_calling_method, col='UMI'): - out_plot = f'{outdir}/{sample}_barcode_UMI_plot.pdf' - import matplotlib - matplotlib.use('Agg') - import matplotlib.pyplot as plt - fig = plt.figure() - plt.plot(df_sum['UMI']) - plt.hlines(threshold, 0, cell_num, linestyle='dashed') - plt.vlines(cell_num, 0, 
threshold, linestyle='dashed') - plt.title('cell_calling_method: %s, expected_cell_num: %s\n %s threshold: %s, cell num: %s' % - (cell_calling_method, expected_cell_num, col, threshold, cell_num)) - plt.loglog() - plt.savefig(out_plot) - - -def get_cell_stats(df_sum, cell_bc, marked_counts_file): - df_sum.loc[:, 'mark'] = 'UB' - df_sum.loc[df_sum.index.isin(cell_bc), 'mark'] = 'CB' - df_sum.to_csv(marked_counts_file, sep='\t') - CB_describe = df_sum.loc[df_sum['mark'] == 'CB', :].describe() - - return CB_describe - - -def write_matrix_10X(table, id_name, matrix_10X_dir): - id = table.index.to_series() - name = id.apply(lambda x: id_name[x]) - genes = pd.concat([id, name], axis=1) - genes.columns = ['gene_id', 'gene_name'] - - #write - table.columns.to_series().to_csv( - f'{matrix_10X_dir}/barcodes.tsv', index=False, sep='\t') - genes.to_csv( - f'{matrix_10X_dir}/genes.tsv', index=False, header=False, sep='\t') - mmwrite(f'{matrix_10X_dir}/matrix', csr_matrix(table)) - return id, name - - -@add_log -def matrix_10X(df, outdir, sample, gtf_file, dir_name='matrix_10X', cell_bc=None): - matrix_10X_dir = f"{outdir}/{sample}_{dir_name}/" - if not os.path.exists(matrix_10X_dir): - os.mkdir(matrix_10X_dir) - id_name = gene_convert(gtf_file) - - if cell_bc is not None: - df = df.loc[df['Barcode'].isin(cell_bc), :] - - df_UMI = df.groupby(['geneID','Barcode']).agg({'UMI':'count'}) - mtx= coo_matrix((df_UMI.UMI, (df_UMI.index.labels[0], df_UMI.index.labels[1]))) - id = df_UMI.index.levels[0].to_series() - # add gene symbol - name = id.apply(lambda x: id_name[x]) - genes = pd.concat([id, name], axis=1) - genes.columns = ['gene_id', 'gene_name'] - - barcodes = df_UMI.index.levels[1].to_series() - genes.to_csv(f'{matrix_10X_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) - barcodes.to_csv(f'{matrix_10X_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') - mmwrite(f'{matrix_10X_dir}/{MATRIX_FILE_NAME}', mtx) - - - return matrix_10X_dir - - -@add_log -def expression_matrix(df, cell_bc, outdir, sample, gtf_file): - - id_name = gene_convert(gtf_file) - - df.loc[:, 'mark'] = 'UB' - df.loc[df['Barcode'].isin(cell_bc), 'mark'] = 'CB' - CB_total_Genes = df.loc[df['mark'] == 'CB', 'geneID'].nunique() - CB_reads_count = df.loc[df['mark'] == 'CB', 'count'].sum() - reads_mapped_to_transcriptome = df['count'].sum() - - table = df.loc[df['mark'] == 'CB', :].pivot_table( - index='geneID', columns='Barcode', values='UMI', - aggfunc=len).fillna(0).astype(int) - - # convert id to name; write table matrix - matrix_table_file = f"{outdir}/{sample}_matrix.tsv.gz" - id = table.index.to_series() - name = id.apply(lambda x: id_name[x]) - table.index = name - table.index.name = "" - table.to_csv( - matrix_table_file, - sep="\t", - compression='gzip') - return(CB_total_Genes, CB_reads_count, reads_mapped_to_transcriptome) - - -def get_summary(df, sample, Saturation, CB_describe, CB_total_Genes, - CB_reads_count, reads_mapped_to_transcriptome, - stat_file, outdir): - - # total read - json_file = outdir + '.data.json' - fh = open(json_file) - data = json.load(fh) - str_number = data['barcode_summary'][1][1].split("(")[0] - valid_read_number = int(str_number.replace(",", "")) - - summary = pd.Series([0, 0, 0, 0, 0, 0, 0], - index=[ - 'Estimated Number of Cells', - 'Fraction Reads in Cells', - 'Mean Reads per Cell', - 'Median UMI per Cell', - 'Total Genes', - 'Median Genes per Cell', - 'Saturation', - ]) - - # number of cells - summary['Estimated Number of Cells'] = int( - CB_describe.loc['count', 'readcount']) - summary['Fraction Reads in Cells'] = '%.2f%%' % (float( - CB_reads_count) / reads_mapped_to_transcriptome * 100) - summary['Mean Reads per Cell'] = int( - valid_read_number / - summary['Estimated Number of Cells']) - summary['Median UMI per Cell'] = int(CB_describe.loc['50%', 'UMI']) - summary['Total Genes'] = int(CB_total_Genes) - summary['Median Genes per Cell'] = int(CB_describe.loc['50%', 'geneID']) - summary['Saturation'] = '%.2f%%' % (Saturation) - # sequencing saturation, defined as the fraction of reads in cells whose UMI has count > 2 - need_format = [ - 'Estimated Number of Cells', - 'Mean Reads per Cell', - 'Median UMI per Cell', - 'Total Genes', - 'Median Genes per Cell'] - for item in need_format: - summary[item] = format_number(summary[item]) - summary.to_csv(stat_file, header=False, sep=':') - - -@add_log -def sub_sample(fraction, df_cell, cell_bc, cell_read_index): -======= ->>>>>>> 39b2447c47c4295f6fdbbe970dc21e2e43b5ee5b ''' @utils.add_log def plot_barcode_UMI(df_sum, threshold, expected_cell_num, cell_num, outdir, sample, cell_calling_method, col='UMI'):
-- Gitee
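[Editor's note — not part of the patch series: the block deleted by PATCH 08 is a leftover pre-refactor copy of count.py from a merge conflict. Its UMI-correction step folds each low-count UMI into a Hamming-distance-1 neighbour unless their counts are too close. A standalone sketch of that idea, with hypothetical names:]

    def hamming_distance(a, b):
        return sum(x != y for x, y in zip(a, b))

    def collapse_umis(umi_counts, percent=0.1):
        # umi_counts: dict mapping UMI -> read count for one gene
        for low in sorted(umi_counts, key=umi_counts.get):
            for high in sorted(umi_counts, key=umi_counts.get, reverse=True):
                if umi_counts[low] / umi_counts[high] > percent:
                    break  # remaining candidates are too close in abundance to merge
                if hamming_distance(low, high) == 1:
                    umi_counts[high] += umi_counts.pop(low)
                    break
        return umi_counts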
From d11da15cef94e649d034d69a25a350b0bee98c45 Mon Sep 17 00:00:00 2001 From: seeuzhouxin Date: Mon, 10 May 2021 16:40:28 +0800
Subject: [PATCH 09/96] Delete file .DS_Store
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
--- .DS_Store | Bin 6148 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 .DS_Store
diff --git a/.DS_Store b/.DS_Store deleted file mode 100644 index fde5083c0902b4b0b5ff087a8db22e54d5043b62..0000000000000000000000000000000000000000
GIT binary patch literal 0 HcmV?d00001 literal 6148 [binary literal data omitted]
-- Gitee
From 4d5869cd1b8bbdaa2ea949fd7bbd961d3ee7fa3b Mon Sep 17 00:00:00 2001 From: seeuzhouxin Date: Mon, 10 May 2021 16:40:36 +0800
Subject: [PATCH 10/96] Delete file celescope/.DS_Store
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit
--- celescope/.DS_Store | Bin 8196 -> 0 bytes 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 celescope/.DS_Store
diff --git a/celescope/.DS_Store b/celescope/.DS_Store deleted file mode 100644 index 78b57bb8670a1b98f6c056cf4c4e0cce4fd8f49b..0000000000000000000000000000000000000000
GIT binary patch literal 0 HcmV?d00001 literal 8196 [binary literal data omitted; the tail of this patch, the mail separator, and the commit hash and author line of PATCH 11 were lost in extraction]
Date: Mon, 10 May 2021 17:50:50 +0800
Subject: [PATCH 11/96] fix
--- celescope/tools/analysis.py | 10 +++++----- celescope/tools/run_analysis.R | 3 +-- 2 files changed, 6 insertions(+), 7 deletions(-)
diff --git a/celescope/tools/analysis.py b/celescope/tools/analysis.py index 5cfc474a..0de33bf2 100755 --- a/celescope/tools/analysis.py +++ b/celescope/tools/analysis.py
@@ -35,10 +35,10 @@ def generate_matrix(gtf_file, matrix_file): @add_log -def seurat(sample, outdir, matrix_file,
save_rds): +def seurat(sample, outdir, matrix_dir, save_rds): app = TOOLSDIR + "/run_analysis.R" cmd = ( - f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_file {matrix_file} ' + f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_dir {matrix_dir} ' f'--save_rds {save_rds}' ) seurat.logger.info(cmd) @@ -64,7 +64,7 @@ class Analysis_rna(Step, AnalysisMixin): def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) - self.matrix_file = args.matrix_file + self.matrix_dir = args.matrix_dir self.type_marker_tsv = args.type_marker_tsv self.auto_assign_bool = False self.save_rds = args.save_rds @@ -73,7 +73,7 @@ class Analysis_rna(Step, AnalysisMixin): self.save_rds = True def run(self): - seurat(self.sample, self.outdir, self.matrix_file, self.save_rds) + seurat(self.sample, self.outdir, self.matrix_dir, self.save_rds) if self.auto_assign_bool: auto_assign(self.sample, self.outdir, self.type_marker_tsv) self.run_analysis() @@ -95,7 +95,7 @@ def analysis(args): def get_opts_analysis(parser, sub_program): if sub_program: parser = s_common(parser) - parser.add_argument('--matrix_file', help='matrix file', required=True) + parser.add_argument('--matrix_dir', help='matrix dir', required=True) parser.add_argument('--save_rds', action='store_true', help='write rds to disk') parser.add_argument('--type_marker_tsv', help='cell type marker tsv') diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 5883c06f..bc49c7fd 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -13,7 +13,7 @@ argv <- add_argument(argv,"--save_rds", help="write rds to disk") argv <- parse_args(argv) #args -matrix_file = argv$matrix_file +matrix_dir = argv$matrix_dir outdir = argv$outdir sample = argv$sample save_rds = argv$save_rds @@ -27,7 +27,6 @@ mito.out = paste(outdir,"stat.txt",sep="/") rds.out = paste0(outdir,'/',sample,'.rds') # read 10X -matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") rds = CreateSeuratObject(matrix, pro=sample) # generate h5ad file -- Gitee From e91ee61875d65903d1b2e6a10186caf46029a329 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 11 May 2021 16:14:08 +0800 Subject: [PATCH 12/96] merge --- Dockerfile | 0 celescope/capture_virus/otsu.py | 0 celescope/capture_virus/test.py | 0 celescope/templates/css/buttons.dataTables.min.css | 0 celescope/templates/css/dataTables.jqueryui.min.css | 0 celescope/templates/css/jquery-ui-git.css | 0 celescope/templates/css/jquery-ui.css | 0 celescope/templates/html/common/consensus_summary.html | 0 .../templates/html/snp/target_metrics_summary.html | 0 celescope/templates/js/buttons.flash.min.js | 0 celescope/templates/js/buttons.html5.min.js | 0 celescope/templates/js/buttons.print.min.js | 0 celescope/templates/js/dataTables.buttons.min.js | 0 celescope/templates/js/dataTables.jqueryui.min.js | 0 celescope/templates/js/jquery.dataTables.min.js | 0 celescope/templates/js/jquery.min.3.3.1.js | 0 celescope/templates/js/jszip.min.js | 0 celescope/templates/js/plotly-1.58.4.min.js | 0 celescope/tests/__init__.py | 0 celescope/tests/func_tests.py | 0 celescope/tests/multi_tests.py | 0 celescope/tests/test_legacy.py | 0 celescope/tools/Step.py | 0 celescope/tools/analysis.py | 10 +++++----- celescope/tools/analysisMixin.py | 2 +- celescope/tools/cellranger3/__init__.py | 0 celescope/tools/cellranger3/cell_calling_3.py | 0 celescope/tools/cellranger3/get_plot_elements.py | 0 celescope/tools/cellranger3/sgt.py | 0 
celescope/tools/cellranger3/stats.py | 0 celescope/tools/consensus.py | 0 celescope/tools/rescue.R | 0 celescope/tools/run_analysis.R | 6 +++--- celescope/tools/target_metrics.py | 0 conda_pkgs.txt | 0 docs/Multi-samples.md | 0 docs/STAR.md | 0 docs/analysis.md | 0 docs/barcode.md | 0 docs/consensus.md | 0 docs/count.md | 0 docs/count_vdj.md | 0 docs/cutadapt.md | 0 docs/featureCounts.md | 0 docs/mapping_vdj.md | 0 docs/v1.1.8.md | 0 docs/v1.1.9.md | 0 47 files changed, 9 insertions(+), 9 deletions(-) mode change 100644 => 100755 Dockerfile mode change 100644 => 100755 celescope/capture_virus/otsu.py mode change 100644 => 100755 celescope/capture_virus/test.py mode change 100644 => 100755 celescope/templates/css/buttons.dataTables.min.css mode change 100644 => 100755 celescope/templates/css/dataTables.jqueryui.min.css mode change 100644 => 100755 celescope/templates/css/jquery-ui-git.css mode change 100644 => 100755 celescope/templates/css/jquery-ui.css mode change 100644 => 100755 celescope/templates/html/common/consensus_summary.html mode change 100644 => 100755 celescope/templates/html/snp/target_metrics_summary.html mode change 100644 => 100755 celescope/templates/js/buttons.flash.min.js mode change 100644 => 100755 celescope/templates/js/buttons.html5.min.js mode change 100644 => 100755 celescope/templates/js/buttons.print.min.js mode change 100644 => 100755 celescope/templates/js/dataTables.buttons.min.js mode change 100644 => 100755 celescope/templates/js/dataTables.jqueryui.min.js mode change 100644 => 100755 celescope/templates/js/jquery.dataTables.min.js mode change 100644 => 100755 celescope/templates/js/jquery.min.3.3.1.js mode change 100644 => 100755 celescope/templates/js/jszip.min.js mode change 100644 => 100755 celescope/templates/js/plotly-1.58.4.min.js mode change 100644 => 100755 celescope/tests/__init__.py mode change 100644 => 100755 celescope/tests/func_tests.py mode change 100644 => 100755 celescope/tests/multi_tests.py mode change 100644 => 100755 celescope/tests/test_legacy.py mode change 100644 => 100755 celescope/tools/Step.py mode change 100644 => 100755 celescope/tools/analysisMixin.py mode change 100644 => 100755 celescope/tools/cellranger3/__init__.py mode change 100644 => 100755 celescope/tools/cellranger3/cell_calling_3.py mode change 100644 => 100755 celescope/tools/cellranger3/get_plot_elements.py mode change 100644 => 100755 celescope/tools/cellranger3/sgt.py mode change 100644 => 100755 celescope/tools/cellranger3/stats.py mode change 100644 => 100755 celescope/tools/consensus.py mode change 100644 => 100755 celescope/tools/rescue.R mode change 100644 => 100755 celescope/tools/target_metrics.py mode change 100644 => 100755 conda_pkgs.txt mode change 100644 => 100755 docs/Multi-samples.md mode change 100644 => 100755 docs/STAR.md mode change 100644 => 100755 docs/analysis.md mode change 100644 => 100755 docs/barcode.md mode change 100644 => 100755 docs/consensus.md mode change 100644 => 100755 docs/count.md mode change 100644 => 100755 docs/count_vdj.md mode change 100644 => 100755 docs/cutadapt.md mode change 100644 => 100755 docs/featureCounts.md mode change 100644 => 100755 docs/mapping_vdj.md mode change 100644 => 100755 docs/v1.1.8.md mode change 100644 => 100755 docs/v1.1.9.md diff --git a/Dockerfile b/Dockerfile old mode 100644 new mode 100755 diff --git a/celescope/capture_virus/otsu.py b/celescope/capture_virus/otsu.py old mode 100644 new mode 100755 diff --git a/celescope/capture_virus/test.py b/celescope/capture_virus/test.py old mode 100644 
new mode 100755 diff --git a/celescope/templates/css/buttons.dataTables.min.css b/celescope/templates/css/buttons.dataTables.min.css old mode 100644 new mode 100755 diff --git a/celescope/templates/css/dataTables.jqueryui.min.css b/celescope/templates/css/dataTables.jqueryui.min.css old mode 100644 new mode 100755 diff --git a/celescope/templates/css/jquery-ui-git.css b/celescope/templates/css/jquery-ui-git.css old mode 100644 new mode 100755 diff --git a/celescope/templates/css/jquery-ui.css b/celescope/templates/css/jquery-ui.css old mode 100644 new mode 100755 diff --git a/celescope/templates/html/common/consensus_summary.html b/celescope/templates/html/common/consensus_summary.html old mode 100644 new mode 100755 diff --git a/celescope/templates/html/snp/target_metrics_summary.html b/celescope/templates/html/snp/target_metrics_summary.html old mode 100644 new mode 100755 diff --git a/celescope/templates/js/buttons.flash.min.js b/celescope/templates/js/buttons.flash.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/buttons.html5.min.js b/celescope/templates/js/buttons.html5.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/buttons.print.min.js b/celescope/templates/js/buttons.print.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/dataTables.buttons.min.js b/celescope/templates/js/dataTables.buttons.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/dataTables.jqueryui.min.js b/celescope/templates/js/dataTables.jqueryui.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/jquery.dataTables.min.js b/celescope/templates/js/jquery.dataTables.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/jquery.min.3.3.1.js b/celescope/templates/js/jquery.min.3.3.1.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/jszip.min.js b/celescope/templates/js/jszip.min.js old mode 100644 new mode 100755 diff --git a/celescope/templates/js/plotly-1.58.4.min.js b/celescope/templates/js/plotly-1.58.4.min.js old mode 100644 new mode 100755 diff --git a/celescope/tests/__init__.py b/celescope/tests/__init__.py old mode 100644 new mode 100755 diff --git a/celescope/tests/func_tests.py b/celescope/tests/func_tests.py old mode 100644 new mode 100755 diff --git a/celescope/tests/multi_tests.py b/celescope/tests/multi_tests.py old mode 100644 new mode 100755 diff --git a/celescope/tests/test_legacy.py b/celescope/tests/test_legacy.py old mode 100644 new mode 100755 diff --git a/celescope/tools/Step.py b/celescope/tools/Step.py old mode 100644 new mode 100755 diff --git a/celescope/tools/analysis.py b/celescope/tools/analysis.py index 0de33bf2..5cfc474a 100755 --- a/celescope/tools/analysis.py +++ b/celescope/tools/analysis.py @@ -35,10 +35,10 @@ def generate_matrix(gtf_file, matrix_file): @add_log -def seurat(sample, outdir, matrix_dir, save_rds): +def seurat(sample, outdir, matrix_file, save_rds): app = TOOLSDIR + "/run_analysis.R" cmd = ( - f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_dir {matrix_dir} ' + f'Rscript {app} --sample {sample} --outdir {outdir} --matrix_file {matrix_file} ' f'--save_rds {save_rds}' ) seurat.logger.info(cmd) @@ -64,7 +64,7 @@ class Analysis_rna(Step, AnalysisMixin): def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) - self.matrix_dir = args.matrix_dir + self.matrix_file = args.matrix_file self.type_marker_tsv = args.type_marker_tsv self.auto_assign_bool = 
False self.save_rds = args.save_rds @@ -73,7 +73,7 @@ class Analysis_rna(Step, AnalysisMixin): self.save_rds = True def run(self): - seurat(self.sample, self.outdir, self.matrix_dir, self.save_rds) + seurat(self.sample, self.outdir, self.matrix_file, self.save_rds) if self.auto_assign_bool: auto_assign(self.sample, self.outdir, self.type_marker_tsv) self.run_analysis() @@ -95,7 +95,7 @@ def analysis(args): def get_opts_analysis(parser, sub_program): if sub_program: parser = s_common(parser) - parser.add_argument('--matrix_dir', help='matrix dir', required=True) + parser.add_argument('--matrix_file', help='matrix file', required=True) parser.add_argument('--save_rds', action='store_true', help='write rds to disk') parser.add_argument('--type_marker_tsv', help='cell type marker tsv') diff --git a/celescope/tools/analysisMixin.py b/celescope/tools/analysisMixin.py old mode 100644 new mode 100755 index df2350d5..5e5e5dbc --- a/celescope/tools/analysisMixin.py +++ b/celescope/tools/analysisMixin.py @@ -50,7 +50,7 @@ class AnalysisMixin(): return html code """ marker_df = self.marker_df.loc[:, - ["cluster", "gene", "avg_logFC", "pct.1", "pct.2", "p_val_adj"] + ["cluster", "gene", "avg_log2FC", "pct.1", "pct.2", "p_val_adj"] ] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") diff --git a/celescope/tools/cellranger3/__init__.py b/celescope/tools/cellranger3/__init__.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/cell_calling_3.py b/celescope/tools/cellranger3/cell_calling_3.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/get_plot_elements.py b/celescope/tools/cellranger3/get_plot_elements.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/sgt.py b/celescope/tools/cellranger3/sgt.py old mode 100644 new mode 100755 diff --git a/celescope/tools/cellranger3/stats.py b/celescope/tools/cellranger3/stats.py old mode 100644 new mode 100755 diff --git a/celescope/tools/consensus.py b/celescope/tools/consensus.py old mode 100644 new mode 100755 diff --git a/celescope/tools/rescue.R b/celescope/tools/rescue.R old mode 100644 new mode 100755 diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index bc49c7fd..2df168aa 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -6,21 +6,21 @@ library(rhdf5) argv <- arg_parser('') -argv <- add_argument(argv,"--matrix_dir", help="cell 10X matrix dir") +argv <- add_argument(argv,"--matrix_file", help="cell 10X matrix dir") argv <- add_argument(argv,"--outdir", help="outdir") argv <- add_argument(argv,"--sample", help="sample") argv <- add_argument(argv,"--save_rds", help="write rds to disk") argv <- parse_args(argv) #args -matrix_dir = argv$matrix_dir +matrix_file = argv$matrix_file outdir = argv$outdir sample = argv$sample save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -matrix = Seurat::Read10X(matrix_dir, gene.column=2) +matrix = Seurat::Read10X(matrix_file, gene.column=2) tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/") diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py old mode 100644 new mode 100755 diff --git a/conda_pkgs.txt b/conda_pkgs.txt old mode 100644 new mode 100755 diff --git a/docs/Multi-samples.md b/docs/Multi-samples.md old mode 100644 new mode 100755 diff --git a/docs/STAR.md b/docs/STAR.md old mode 100644 
new mode 100755 diff --git a/docs/analysis.md b/docs/analysis.md old mode 100644 new mode 100755 diff --git a/docs/barcode.md b/docs/barcode.md old mode 100644 new mode 100755 diff --git a/docs/consensus.md b/docs/consensus.md old mode 100644 new mode 100755 diff --git a/docs/count.md b/docs/count.md old mode 100644 new mode 100755 diff --git a/docs/count_vdj.md b/docs/count_vdj.md old mode 100644 new mode 100755 diff --git a/docs/cutadapt.md b/docs/cutadapt.md old mode 100644 new mode 100755 diff --git a/docs/featureCounts.md b/docs/featureCounts.md old mode 100644 new mode 100755 diff --git a/docs/mapping_vdj.md b/docs/mapping_vdj.md old mode 100644 new mode 100755 diff --git a/docs/v1.1.8.md b/docs/v1.1.8.md old mode 100644 new mode 100755 diff --git a/docs/v1.1.9.md b/docs/v1.1.9.md old mode 100644 new mode 100755
-- Gitee
From f6dfd7baca55bf81ad4a0ed1f66c188e592925be Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 11 May 2021 18:09:12 +0800
Subject: [PATCH 13/96] avg_logFC to avg_log2FC
--- celescope/tools/Analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/celescope/tools/Analysis.py b/celescope/tools/Analysis.py index 21210743..ac4aabde 100755 --- a/celescope/tools/Analysis.py +++ b/celescope/tools/Analysis.py
@@ -106,7 +106,7 @@ class Analysis(): return html code """ marker_df = self.marker_df.loc[:, ["cluster", "gene", - "avg_logFC", "pct.1", "pct.2", "p_val_adj"]] + "avg_log2FC", "pct.1", "pct.2", "p_val_adj"]] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") return marker_df
@@ -124,7 +124,7 @@ class Analysis(): return html code """ marker_df = self.marker_df.loc[:, ["cluster", "gene", - "avg_logFC", "pct.1", "pct.2", "p_val_adj"]] + "avg_log2FC", "pct.1", "pct.2", "p_val_adj"]] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") marker_gene_table = marker_df.to_html( escape=False,
-- Gitee
From 9fe1f4632bcd19d0a3b6e0691ae03bf5ad001b92 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 11 May 2021 18:34:34 +0800
Subject: [PATCH 14/96] change FindVariableFeatures parameters
--- celescope/tools/run_analysis.R | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 2df168aa..4a050c29 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R
@@ -62,8 +62,7 @@ write_delim(mito_df, mito.out, col_names=F, delim=":") rds <- NormalizeData(rds, normalization.method = "LogNormalize",scale.factor = 10000) -rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 2000, mean.cutoff = c(0.1, 8), dispersion.cutoff = c(1, Inf), - mean.function = ExpMean, dispersion.function = LogVMR) +rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 10000) use.genes <- rds@assays$RNA@var.features rds <- ScaleData(rds, vars.to.regress = c("nCount_RNA", "percent.mito"), features = use.genes)
-- Gitee
From 5ddd153d90f4733901ecefb8dc01460e103b1051 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 17 May 2021 13:57:04 +0800
Subject: [PATCH 15/96] fix
--- celescope/tools/run_analysis.R | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-)
diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 4a050c29..bb654e94 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R
@@ -35,13 +35,16 @@ mtx = as.matrix(x) barcode = colnames(rds) geneid = rownames(rds) h5.out = stringr::str_glue('{outdir}/{sample}.h5') -path <-
path.expand(h5.out) -h5createFile(path) -h5f <- H5Fopen(path) -h5writeDataset(mtx,h5f,"X") -h5writeDataset(barcode,h5f,"obs") -h5writeDataset(geneid,h5f,"var") -H5Fclose(h5f) +if (file.exists(h5.out) == FALSE){ + path <- path.expand(h5.out) + h5createFile(path) + h5f <- H5Fopen(path) + h5writeDataset(mtx,h5f,"X") + h5writeDataset(barcode,h5f,"obs") + h5writeDataset(geneid,h5f,"var") + H5Fclose(h5f) +} + # mito mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@assays$RNA@data), value = TRUE, ignore.case=TRUE)
-- Gitee
From 8aa87820def4eb6e2dca90fb348e7e35911410ae Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 27 May 2021 15:42:32 +0800
Subject: [PATCH 16/96] add tracer_vdj
--- celescope/tracer_vdj/__init__.py | 7 ++ celescope/tracer_vdj/go_assemble.py | 140 +++++++++++++++++++++ celescope/tracer_vdj/multi_tracer_vdj.py | 57 +++++++++ celescope/tracer_vdj/split_fastq.py | 125 ++++++++++++++++++++ 4 files changed, 329 insertions(+) create mode 100644 celescope/tracer_vdj/__init__.py create mode 100755 celescope/tracer_vdj/go_assemble.py create mode 100755 celescope/tracer_vdj/multi_tracer_vdj.py create mode 100755 celescope/tracer_vdj/split_fastq.py
diff --git a/celescope/tracer_vdj/__init__.py b/celescope/tracer_vdj/__init__.py new file mode 100644 index 00000000..eb48cf9f --- /dev/null +++ b/celescope/tracer_vdj/__init__.py
@@ -0,0 +1,7 @@ +__STEPS__ = [ + 'sample', + 'barcode', + 'cutadapt', + 'split_fastq', + 'go_assemble'] +__ASSAY__ = 'tracer_vdj'
diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py new file mode 100755 index 00000000..65f4696a --- /dev/null +++ b/celescope/tracer_vdj/go_assemble.py
@@ -0,0 +1,140 @@ +import argparse +import os +from os import listdir +from os.path import isfile, join +from concurrent.futures import ProcessPoolExecutor +from celescope.tools.utils import add_log +import datetime + + +TRACER_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer' +CONF_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer.conf' +BRACER_PATH = '/SGRNJ03/randd/zhouxin/software/bracer/bracer' +BRACER_CONDA = 'bracer' +BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' + + +# start assembly + + +def bracer_summarise(outdir): + bracer_outdir = f'{outdir}/bracer' + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{BRACER_PATH} summarise ' + f'-c {BRACER_CONF} ' + f'--no_networks ' + f'{bracer_outdir} ' + ) + bracer_summarise.logger.info(cmd) + os.system(cmd) + + +def bracer(fq, outdir, species): + prefix = os.path.basename(fq).strip('.fq') + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{BRACER_PATH} assemble ' + f'--fragment_length 150 ' + f'--fragment_sd 5 ' + f'--single_end ' + f'--small_index ' + f'--species {species} ' + f'-c {BRACER_CONF} ' + f'{prefix} ' + f'{outdir}/bracer ' + f'{fq} ' + ) + bracer.logger.info(cmd) + os.system(cmd) + + +def tracer_summarise(outdir): + tracer_outdir = f'{outdir}/tracer' + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{TRACER_PATH} summarise ' + f'-c {CONF_PATH} ' + f'--no_networks ' + f'{tracer_outdir} ' + ) + tracer_summarise.logger.info(cmd) + os.system(cmd) + + +def tracer(fq, outdir, species): + prefix = os.path.basename(fq).strip('.fq') + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'{TRACER_PATH} assemble ' + f'--fragment_length 150 ' + f'--fragment_sd 5 ' + f'--single_end ' + f'--small_index ' + f'-m assembly ' + f'--species {species} ' + f'-c {CONF_PATH} ' + f'{fq} ' + f'{prefix} ' + f'{outdir}/tracer ' + ) + tracer.logger.info(cmd) + os.system(cmd) + +
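# [Editor's note — not part of the patch] In bracer() and tracer() above,
# os.path.basename(fq).strip('.fq') strips any leading/trailing '.', 'f' or 'q'
# characters, not the '.fq' suffix: "q1.fq".strip('.fq') gives "1" rather than
# the intended "q1" (while "sample.fq" only survives as "sample" by luck).
# A suffix-safe alternative would be:
#     prefix = os.path.splitext(os.path.basename(fq))[0]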
+@add_log +def run_tracer(outdir, fastq_dir, species, thread): + + fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] + outdirs = [outdir] * len(fqs) + species = [species] * len(fqs) + if not os.path.exists(f'{outdir}/tracer'): + os.makedirs(f'{outdir}/tracer') + + all_res = [] + with ProcessPoolExecutor(thread) as pool: + for res in pool.map(tracer, fqs, outdirs, species): + all_res.append(res) + + tracer_summarise(outdir) + + +@add_log +def run_bracer(outdir, fastq_dir, species, thread): + fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] + outdirs = [outdir] * len(fqs) + species = [species] * len(fqs) + if not os.path.exists(f'{outdir}/bracer'): + os.makedirs(f'{outdir}/bracer') + + all_res = [] + with ProcessPoolExecutor(thread) as pool: + for res in pool.map(bracer, fqs, outdirs, species): + all_res.append(res) + + bracer_summarise(outdir) + + +def go_assemble(args): + thread = int(args.thread) + fastq_dir = args.fastq_dir + outdir = args.outdir + species = args.species + + mode = args.mode + if mode == 'TCR': + run_tracer(outdir, fastq_dir, species, thread) + elif mode == 'BCR': + run_bracer(outdir, fastq_dir, species, thread) + + +def get_opts_go_assemble(parser, sub_program): + if sub_program: + parser.add_argument("--outdir", help="assemble outdir", required=True) + parser.add_argument("--sample", help="vdj sample name", required=True) + parser.add_argument('--assay', help='assay', required=True) + parser.add_argument('--fastq_dir', required=True) + parser.add_argument('--mode', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--thread', help='thread', default=20) + diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py new file mode 100755 index 00000000..97beed69 --- /dev/null +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -0,0 +1,57 @@ +from celescope.tracer_vdj.__init__ import __STEPS__, __ASSAY__ +from celescope.tools.Multi import Multi + + +class Multi_tracer_vdj(Multi): + def custome_args(self): + self.parser.add_argument('--thread', help='thread', default=20) + self.parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR']) + self.parser.add_argument('--species', help='species name', choices=['Hsap', 'Mmus']) + + def read_custome_args(self): + self.thread = self.args.thread + self.mode = self.args.mode + self.species = self.args.species + + def split_fastq(self, sample): + step = 'split_fastq' + fq = f'{self.outdir_dic[sample]["cutadapt"]}/{sample}_clean_2.fq{self.fq_suffix}' + cmd = ( + f'{self.__APP__} ' + f'{self.__ASSAY__} ' + f'{step} ' + f'--outdir {self.outdir_dic[sample][step]} ' + f'--sample {sample} ' + f'--assay {self.__ASSAY__} ' + f'--fq {fq} ' + f'--mode {self.mode} ' + f'--match_dir {self.col4_dict[sample]} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + + def go_assemble(self, sample): + step = 'go_assemble' + fastq_dir = f'{self.outdir_dic[sample]["split_fq"]}/fastq' + cmd = ( + f'{self.__APP__} ' + f'{self.__ASSAY__} ' + f'{step} ' + f'--outdir {self.outdir_dic[sample][step]} ' + f'--sample {sample} ' + f'--assay {self.__ASSAY__} ' + f'--fastq_dir {fastq_dir} ' + f'--mode {self.mode} ' + f'--species {self.species} ' + f'--thread {self.thread} ' + ) + self.process_cmd(cmd, step, sample, m=1.5 * int(self.args.thread), x=self.args.thread) + + +def main(): + multi = Multi_tracer_vdj(__ASSAY__) + 
multi.run() + +if __name__ == '__main__': + main() + diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py new file mode 100755 index 00000000..70ada28b --- /dev/null +++ b/celescope/tracer_vdj/split_fastq.py @@ -0,0 +1,125 @@ +import pysam +from collections import defaultdict +import os +import argparse +import datetime +import pandas as pd +from Bio.Seq import Seq +from glob import glob +from celescope.tools.utils import add_log + + +@add_log +def annotation_barcodes(match_dir, mode): + + cluster_data = glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv')[0] + + cluster_type = pd.read_csv(cluster_data, sep='\t') + + # filter barcodes + if mode == 'TCR': + clusters = list(cluster_type[cluster_type['cell_type'] == 'T cells']['cluster']) + elif mode == 'BCR': + clusters = list(cluster_type[cluster_type['cell_type'] == 'B cells']['cluster']) + + tsne = glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv')[0] + tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) + + barcodes = [] + for cluster in clusters: + tmp = tsne_coord[tsne_coord['cluster'] == cluster].index.tolist() + barcodes += tmp + # write barcodes + barcodes_path = glob(f'{match_dir}/06.analysis/*_auto_assign/')[0] + + with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: + for barcode in barcodes: + barcode = Seq(barcode) + barcode_reversed = barcode.reverse_complement() + bc = str(barcode_reversed) + fh.write(bc + '\n') + + with open(f'{barcodes_path}/reversed_barcodes.tsv') as res: + res = res.readlines() + return res + + +@add_log +def get_fastq_to_assemble(fq_outdir, fq, barcodes): + """ + split_fastq + """ + if not os.path.exists(fq_outdir): + os.makedirs(fq_outdir) + + barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads + reads_count_dict = {} # all barcodes and reads num for each barcode + all_barcodes = [] # all barcodes + with pysam.FastxFile(fq) as fq: + for entry in fq: + attr = entry.name.split('_') + barcode = attr[0] + all_barcodes.append(barcode) + barcode_reads_dict[barcode].append(entry) + for barcode in list(barcode_reads_dict.keys()): + reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) + + + barcodes_for_match = [] + for barcode in barcodes: + barcode = barcode.strip('\n') + barcodes_for_match.append(barcode) + barcodes_to_use = list(set(barcodes_for_match).intersection(set(all_barcodes))) + # barcodes in both RNA data and BCR data + + barcode_reads_useful = {barcode: barcode_reads_dict[barcode] for barcode in barcodes_to_use} + + + barcodes_reads_count = {barcode: reads_count_dict[barcode] for barcode in + list(barcode_reads_useful.keys())} + + barcodes_reads_cal = pd.DataFrame.from_dict(barcodes_reads_count, orient='index',columns=['counts']) + barcodes_reads_cal = barcodes_reads_cal.reset_index().rename(columns={'index': 'barcode'}) + barcodes_reads_cal = barcodes_reads_cal.sort_values(by='counts', ascending=False) + + i = 1 + for barcode in list(barcode_reads_useful.keys()): + + with open(f'{fq_outdir}/{i}.fq', 'w') as f: + for entry in barcode_reads_useful[barcode]: + f.write(str(entry) + '\n') + if i % 100 == 0: + get_fastq_to_assemble.logger.info(f'processed {i} cells') + i += 1 + #stat file + barcodes_reads_cal.to_csv(f'{fq_outdir}/reads_count.tsv', sep='\t') + + stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) + with open(f'{fq_outdir}/stat.txt', 'w') as s: + s.write(stat_string) + + +def split_fastq(args): + mode = args.mode + match_dir = 
args.match_dir + sample = args.sample + outdir = args.outdir + assay = args.assay + fq = args.fq + + fq_outdir = f'{outdir}/fastq' + barcodes = annotation_barcodes(match_dir, mode) + + get_fastq_to_assemble(fq_outdir, fq, barcodes) + + +def get_opts_split_fastq(parser, sub_program): + if sub_program: + parser.add_argument('--sample',help='sample name', required=True) + parser.add_argument('--outdir', help='output dir', required=True) + parser.add_argument('--assay', help='assay', required=True) + parser.add_argument('--fq', required=True) + parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--match_dir', help='matched rna_dir') + +
-- Gitee
From 47ad52acf58f732d4db731a6ebd5eef7d0cf009c Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 27 May 2021 15:43:11 +0800
Subject: [PATCH 17/96] add tracer_vdj
--- celescope/__init__.py | 1 + 1 file changed, 1 insertion(+)
diff --git a/celescope/__init__.py b/celescope/__init__.py index e87176ac..015f41d1 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py
@@ -14,4 +14,5 @@ ASSAY_DICT = { 'tag': 'Single Cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', + 'tracer_vdj': 'Single Cell Full Length TCR or BCR' }
-- Gitee
From ef334a0d8e59a1409baf73113775aced62679d6b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 28 May 2021 13:11:36 +0800
Subject: [PATCH 18/96] add tracer_vdj and fix bug
--- celescope/templates/html/tracer_vdj/base.html | 161 ++++++++++++++++++ celescope/tools/barcode.py | 14 +- celescope/tracer_vdj/go_assemble.py | 28 ++- celescope/tracer_vdj/multi_tracer_vdj.py | 35 +--- celescope/tracer_vdj/split_fastq.py | 58 ++++--- 5 files changed, 226 insertions(+), 70 deletions(-) create mode 100755 celescope/templates/html/tracer_vdj/base.html
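[Editor's note — not part of the patch series: the base.html template added below guards every report section with {% if ... is defined %}, so only the steps that supply data are rendered. A minimal sketch of rendering it, assuming Jinja2; the loader path and context are illustrative:]

    from jinja2 import Environment, FileSystemLoader

    env = Environment(loader=FileSystemLoader("celescope/templates"))
    template = env.get_template("html/tracer_vdj/base.html")
    # sections whose variables are left undefined are skipped by the guards
    html = template.render(sample_summary={"Sample": "test1"})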
+<!-- navbar markup (lost in extraction); brand text: "CeleScope Report" -->
+    {% if sample_summary is defined %}
+        {% include "html/common/sample_summary.html"%}
+    {% endif %}
+
+    {% if barcode_summary is defined %}
+        {% include "html/common/barcode_summary.html"%}
+    {% endif %}
+
+    {% if cutadapt_summary is defined %}
+        {% include "html/common/cutadapt_summary.html"%}
+    {% endif %}
+
+    {% if consensus_summary is defined %}
+        {% include "html/common/consensus_summary.html"%}
+    {% endif %}
+
+    {% if split_fastq is defined %}
+        {% include "html/tracer_vdj/split_fastq_summary.html"%}
+    {% endif %}
+
+    {% if go_assemble_summary is defined %}
+        {% include "html/tracer_vdj/go_assemble_summary.html"%}
+    {% endif %}
+
+    {% if table_dict is defined %}
+        {% include "html/vdj/clonetypes_table.html"%}
+    {% endif %}
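Each summary card above is gated on whether its context variable was defined at render time, so one base template can serve every combination of pipeline steps. A minimal sketch of that behavior, assuming jinja2 as the template engine; the loader setup and context keys below are illustrative, not CeleScope's actual report code:

from jinja2 import Environment, DictLoader

# One-template environment; the real report loads base.html from the package.
env = Environment(loader=DictLoader({
    "base.html": (
        "{% if sample_summary is defined %}[sample card]{% endif %}"
        "{% if vdj_sum_summary is defined %}[cell card]{% endif %}"
    )
}))

# Only keys actually passed to render() make their card appear.
html = env.get_template("base.html").render(sample_summary=[("Samples", "1")])
print(html)  # -> "[sample card]"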
+ + + + \ No newline at end of file diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index 3974a4de..1c48f6e3 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -225,12 +225,17 @@ class Barcode(Step): self.lowNum = args.lowNum self.lowQual = args.lowQual self.allowNoPolyT = args.allowNoPolyT - self.allowNoLinker = args.allowNoLinker + self.allowNoLinker = args.allowNoLinker + self.paired_fq = args.paired_fq + self.new_f1 = f'{self.outdir}/{self.sample}_new_R1.fq{suffix}' + self.new_f2 = f'{self.outdir}/{self.sample}_new_R2.fq{suffix}' @utils.add_log def run(self): fh3 = xopen(self.out_fq2, 'w') + new_f1 = xopen(self.new_f1, 'w') + new_f2 = xopen(self.new_f2, 'w') if self.nopolyT: fh1_without_polyT = xopen(self.outdir + '/noPolyT_1.fq', 'w') @@ -377,6 +382,12 @@ class Barcode(Step): self.umi_qual_Counter.update(C_U_quals_ascii[C_len:]) fh3.write(f'@{cb}_{umi}_{self.total_num}\n{seq2}\n+\n{qual2}\n') + + if self.paired_fq: + + new_f1.write(f'@{header1}\n{cb}{umi}\n+\n{C_U_quals_ascii}\n') + new_f2.write(f'@{header2}\n{seq2}\n+\n{qual2}\n') + Barcode.run.logger.info(self.fq1_list[i] + ' finished.') fh3.close() @@ -487,6 +498,7 @@ def get_opts_barcode(parser, sub_program=True): parser.add_argument('--gzip', help="output gzipped fastq", action='store_true') parser.add_argument( '--chemistry', choices=__PATTERN_DICT__.keys(), help='chemistry version', default='auto') + parser.add_argument('--paired_fq', help="output R1 R2", action='store_true') if sub_program: parser.add_argument('--fq1', help='read1 fq file', required=True) parser.add_argument('--fq2', help='read2 fq file', required=True) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 65f4696a..af5c622d 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -3,7 +3,8 @@ import os from os import listdir from os.path import isfile, join from concurrent.futures import ProcessPoolExecutor -from celescope.tools.utils import add_log +from celescope.tools import utils +from celescope.tools.utils import * import datetime @@ -16,7 +17,7 @@ BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' # 开始组装 - +@utils.add_log def bracer_summarise(outdir): bracer_outdir = f'{outdir}/bracer' cmd = ( @@ -29,7 +30,7 @@ def bracer_summarise(outdir): bracer_summarise.logger.info(cmd) os.system(cmd) - +@utils.add_log def bracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( @@ -48,7 +49,7 @@ def bracer(fq, outdir, species): bracer.logger.info(cmd) os.system(cmd) - +@utils.add_log def tracer_summarise(outdir): tracer_outdir = f'{outdir}/tracer' cmd = ( @@ -61,7 +62,7 @@ def tracer_summarise(outdir): tracer_summarise.logger.info(cmd) os.system(cmd) - +@utils.add_log def tracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( @@ -82,7 +83,7 @@ def tracer(fq, outdir, species): os.system(cmd) -@add_log +@utils.add_log def run_tracer(outdir, fastq_dir, species, thread): fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] @@ -99,7 +100,7 @@ def run_tracer(outdir, fastq_dir, species, thread): tracer_summarise(outdir) -@add_log +@utils.add_log def run_bracer(outdir, fastq_dir, species, thread): fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] outdirs = [outdir] * len(fqs) @@ -121,20 +122,17 @@ def go_assemble(args): outdir = args.outdir species = args.species - mode = args.mode - if mode == 'TCR': + type = args.type + if 
type == 'TCR': run_tracer(outdir, fastq_dir, species, thread) - elif mode == 'BCR': + elif type == 'BCR': run_bracer(outdir, fastq_dir, species, thread) def get_opts_go_assemble(parser, sub_program): if sub_program: - parser.add_argument("--outdir", help="assemble outdir", required=True) - parser.add_argument("--sample", help="vdj sample name", required=True) - parser.add_argument('--assay', help='assay', required=True) + parser = s_common(parser) parser.add_argument('--fastq_dir', required=True) - parser.add_argument('--mode', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--type', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) - parser.add_argument('--thread', help='thread', default=20) diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 97beed69..8efb09f4 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -3,49 +3,28 @@ from celescope.tools.Multi import Multi class Multi_tracer_vdj(Multi): - def custome_args(self): - self.parser.add_argument('--thread', help='thread', default=20) - self.parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR']) - self.parser.add_argument('--species', help='species name', choices=['Hsap', 'Mmus']) - - def read_custome_args(self): - self.thread = self.args.thread - self.mode = self.args.mode - self.species = self.args.species def split_fastq(self, sample): step = 'split_fastq' + cmd_line = self.get_cmd_line(step, sample) fq = f'{self.outdir_dic[sample]["cutadapt"]}/{sample}_clean_2.fq{self.fq_suffix}' cmd = ( - f'{self.__APP__} ' - f'{self.__ASSAY__} ' - f'{step} ' - f'--outdir {self.outdir_dic[sample][step]} ' - f'--sample {sample} ' - f'--assay {self.__ASSAY__} ' + f'{cmd_line} ' f'--fq {fq} ' - f'--mode {self.mode} ' - f'--match_dir {self.col4_dict[sample]} ' + f'--match_dir {self.col4_dict[sample]}' ) self.process_cmd(cmd, step, sample, m=5, x=1) def go_assemble(self, sample): step = 'go_assemble' - fastq_dir = f'{self.outdir_dic[sample]["split_fq"]}/fastq' + cmd_line = self.get_cmd_line(step, sample) + fastq_dir = f'{self.outdir_dic[sample]["split_fastq"]}/fastq' cmd = ( - f'{self.__APP__} ' - f'{self.__ASSAY__} ' - f'{step} ' - f'--outdir {self.outdir_dic[sample][step]} ' - f'--sample {sample} ' - f'--assay {self.__ASSAY__} ' + f'{cmd_line} ' f'--fastq_dir {fastq_dir} ' - f'--mode {self.mode} ' - f'--species {self.species} ' - f'--thread {self.thread} ' ) - self.process_cmd(cmd, step, sample, m=1.5 * int(self.args.thread), x=self.args.thread) + self.process_cmd(cmd, step, sample, m=30, x=self.args.thread) def main(): diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 70ada28b..9b14517b 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -5,24 +5,26 @@ import argparse import datetime import pandas as pd from Bio.Seq import Seq -from glob import glob -from celescope.tools.utils import add_log +import glob +from celescope.tools import utils +from celescope.tools.utils import * -@add_log -def annotation_barcodes(match_dir, mode): +@utils.add_log +def annotation_barcodes(match_dir, type): - cluster_data = glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv')[0] - + cluster_data = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv') + cluster_data = cluster_data[0] 
cluster_type = pd.read_csv(cluster_data, sep='\t') # filter barcodes - if mode == 'TCR': + if type == 'TCR': clusters = list(cluster_type[cluster_type['cell_type'] == 'T cells']['cluster']) - elif mode == 'BCR': + elif type == 'BCR': clusters = list(cluster_type[cluster_type['cell_type'] == 'B cells']['cluster']) - tsne = glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv')[0] + tsne = glob.glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv') + tsne = tsne[0] tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) barcodes = [] @@ -30,8 +32,8 @@ def annotation_barcodes(match_dir, mode): tmp = tsne_coord[tsne_coord['cluster'] == cluster].index.tolist() barcodes += tmp # write barcodes - barcodes_path = glob(f'{match_dir}/06.analysis/*_auto_assign/')[0] - + barcodes_path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') + barcodes_path = barcodes_path[0] with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: for barcode in barcodes: barcode = Seq(barcode) @@ -44,7 +46,7 @@ def annotation_barcodes(match_dir, mode): return res -@add_log +@utils.add_log def get_fastq_to_assemble(fq_outdir, fq, barcodes): """ split_fastq @@ -82,25 +84,30 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): barcodes_reads_cal = barcodes_reads_cal.reset_index().rename(columns={'index': 'barcode'}) barcodes_reads_cal = barcodes_reads_cal.sort_values(by='counts', ascending=False) + barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') + + stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) + with open(f'{fq_outdir}/../stat.txt', 'w') as s: + s.write(stat_string) + i = 1 for barcode in list(barcode_reads_useful.keys()): with open(f'{fq_outdir}/{i}.fq', 'w') as f: for entry in barcode_reads_useful[barcode]: f.write(str(entry) + '\n') - if i % 100 == 0: + if i % 1000 == 0: get_fastq_to_assemble.logger.info(f'processed {i} cells') - i += 1 - #stat file - barcodes_reads_cal.to_csv(f'{fq_outdir}/reads_count.tsv', sep='\t') - stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) - with open(f'{fq_outdir}/stat.txt', 'w') as s: - s.write(stat_string) + if i == len(list(barcode_reads_useful.keys())): + get_fastq_to_assemble.loogger.info(f'finnaly get {i} cells') + + i += 1 + def split_fastq(args): - mode = args.mode + type = args.type match_dir = args.match_dir sample = args.sample outdir = args.outdir @@ -108,18 +115,17 @@ def split_fastq(args): fq = args.fq fq_outdir = f'{outdir}/fastq' - barcodes = annotation_barcodes(match_dir, mode) + barcodes = annotation_barcodes(match_dir, type) get_fastq_to_assemble(fq_outdir, fq, barcodes) def get_opts_split_fastq(parser, sub_program): if sub_program: - parser.add_argument('--sample',help='sample name', required=True) - parser.add_argument('--outdir', help='output dir', required=True) - parser.add_argument('--assay', help='assay', required=True) + parser = s_common(parser) parser.add_argument('--fq', required=True) - parser.add_argument('--mode', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) - parser.add_argument('--match_dir', help='matched rna_dir') + parser.add_argument('--match_dir', help='matched rna_dir') + parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + -- Gitee From ecfbaa7de7a73b09b82e6c3eafee977a8973f4e8 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 28 May 2021 19:20:27 +0800 Subject: [PATCH 19/96] add vdj_sum to filter results and summarise --- celescope/tracer_vdj/__init__.py | 3 +- 
celescope/tracer_vdj/go_assemble.py | 8 +- celescope/tracer_vdj/multi_tracer_vdj.py | 9 + celescope/tracer_vdj/split_fastq.py | 2 +- celescope/tracer_vdj/vdj_sum.py | 214 +++++++++++++++++++++++ 5 files changed, 230 insertions(+), 6 deletions(-) create mode 100644 celescope/tracer_vdj/vdj_sum.py diff --git a/celescope/tracer_vdj/__init__.py b/celescope/tracer_vdj/__init__.py index eb48cf9f..cdeab6e0 100644 --- a/celescope/tracer_vdj/__init__.py +++ b/celescope/tracer_vdj/__init__.py @@ -3,5 +3,6 @@ __STEPS__ = [ 'barcode', 'cutadapt', 'split_fastq', - 'go_assemble'] + 'go_assemble', + 'vdj_sum'] __ASSAY__ = 'tracer_vdj' diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index af5c622d..6f276d23 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -17,7 +17,7 @@ BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' # 开始组装 -@utils.add_log + def bracer_summarise(outdir): bracer_outdir = f'{outdir}/bracer' cmd = ( @@ -30,7 +30,7 @@ def bracer_summarise(outdir): bracer_summarise.logger.info(cmd) os.system(cmd) -@utils.add_log + def bracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( @@ -49,7 +49,7 @@ def bracer(fq, outdir, species): bracer.logger.info(cmd) os.system(cmd) -@utils.add_log + def tracer_summarise(outdir): tracer_outdir = f'{outdir}/tracer' cmd = ( @@ -62,7 +62,7 @@ def tracer_summarise(outdir): tracer_summarise.logger.info(cmd) os.system(cmd) -@utils.add_log + def tracer(fq, outdir, species): prefix = os.path.basename(fq).strip('.fq') cmd = ( diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 8efb09f4..173b1e52 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -26,6 +26,15 @@ class Multi_tracer_vdj(Multi): ) self.process_cmd(cmd, step, sample, m=30, x=self.args.thread) + def vdj_sum(self, sample): + step = 'vdj_sum' + cmd_line = self.get_cmd_line(step, sample) + ass_dir = f'{self.outdir_dic[sample]["go_assemble"]}' + cmd = ( + f'{cmd_line} ' + f'--ass_dir {ass_dir} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=2) def main(): multi = Multi_tracer_vdj(__ASSAY__) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 9b14517b..024c295a 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -86,7 +86,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') - stat_string = 'All cells:{}\nmatched cell:{}'.format(len(all_barcodes), len(barcode_reads_useful)) + stat_string = 'All cells:{}\nmatched cell:{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) with open(f'{fq_outdir}/../stat.txt', 'w') as s: s.write(stat_string) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py new file mode 100644 index 00000000..162e5f0c --- /dev/null +++ b/celescope/tracer_vdj/vdj_sum.py @@ -0,0 +1,214 @@ +import pysam +from collections import defaultdict +import os +import argparse +import datetime +import pandas as pd +from Bio.Seq import Seq +import glob +from celescope.tools import utils +from celescope.tools.utils import * + + +def tpm_count(ass_dir): + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recom' # ass_dir outdir/sample/04.go_assemble + f'binants.txt', sep='\t') + productive = rec[rec['productive'] == True] + productive['TPM'] = '' + indx = list(productive.index) + for i in 
indx: + cell_name = productive.at[i, 'cell_name'] + rec_id = productive.at[i, 'recombinant_id'] + with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: + for line in tsvf: + if rec_id in line: + line = line.rstrip() + line = line.split('\t') + tpm = float(line[4]) + productive.loc[i, 'TPM'] = tpm + + return productive + + +def filtering(type, ass_dir, sum_dir): + + if not os.path.exists(sum_dir): + os.makedirs(sum_dir) + + if type == 'TCR': + data = tpm_count(ass_dir) + cell_name = set(list(data['cell_name'])) + filtered = pd.DataFrame() + for name in cell_name: + count_data = data[data['cell_name'] == name] + tra = count_data[count_data['locus'] == 'A'] + trb = count_data[count_data['locus'] == 'B'] + if tra.empty is not True: + tra = tra.sort_values(by='TPM', ascending=False) + tra = tra.head(1) + filtered = filtered.append(tra, ignore_index=True) + if trb.empty is not True: + trb = trb.sort_values(by='TPM', ascending=False) + trb = trb.head(1) + filtered = filtered.append(trb, ignore_index=True) + filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + + elif type == 'BCR': + + data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') + data = data[data['FUNCTIONAL'] == True] + cell_name = set(list(data['CELL'])) + filtered = pd.DataFrame() + for name in cell_name: + count_cell = data[data['CELL'] == name] + count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) + count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) + count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) + count_k_l = count_k.append(count_l) + if count_h.empty is not True: + count_h = count_h.sort_values(by='TPM', ascending=False) + count_h = count_h.head(1) + filtered = filtered.append(count_h, ignore_index=True) + if count_k_l.empty is not True: + count_k_l = count_k_l.sort_values(by='TPM', ascending=False) + count_k_l = count_k_l.head(1) + filtered = filtered.append(count_k_l, ignore_index=True) + + filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + + return filtered + + +def res_sum(type, ass_dir, sum_dir): + filtered = filtering(type, ass_dir, sum_dir) + + if type == 'TCR': + count_a = filtered[filtered['locus'] == 'A'].shape[0] + count_b = filtered[filtered['locus'] == 'B'].shape[0] + paired_cell = pd.DataFrame(filtered['cell_name'].value_counts()) + productive_cells = paired_cell.shape[0] + unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] + paired_cell = paired_cell[paired_cell['cell_name'] == 2] + paired_cell = list(paired_cell.index) + string1 = f'productive TRA:\t{count_a}/{productive_cells}\nproductive TRB:\t{count_b}/{productive_cells}\npaired TRA and TRB:\t{len(paired_cell)}/{productive_cells}\n' + + with open(f'{sum_dir}/stat.txt', 'w') as fh: + fh.write(string1) + + aaseqs = [] + for cell in paired_cell: + temp = filtered[filtered['cell_name'] == cell] + temp_loci = list(temp['locus']) + temp_aaseq = list(temp['CDR3aa']) + string = 'TR{}:C{}F;TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0], temp_loci[1], temp_aaseq[1]) + aaseqs.append(string) + + for cell in list(unpaired_cell.index): + temp = filtered[filtered['cell_name'] == cell] + temp_loci = list(temp['locus']) + temp_aaseq = list(temp['CDR3aa']) + string = 'TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0]) + aaseqs.append(string) + + per_count_data = pd.DataFrame() + per_count_data['cdr3s_aa'] = aaseqs + clone_count = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) + clone_count.columns = ["frequency"] + proportation = [] + sum = 
clone_count['frequency'].sum() + for f in list(clone_count['frequency']): + p = f/sum + proportation.append(p) + clone_count['proportation'] = proportation + clone_count = clone_count.reset_index() + clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + + elif type == 'BCR': + filtered_h = filtered[filtered['LOCUS'] == 'H'] + filtered_k = filtered[filtered['LOCUS'] == 'K'] + filtered_l = filtered[filtered['LOCUS'] == 'L'] + filtered_h_count = filtered_h.shape[0] + filtered_k_count = filtered_k.shape[0] + filtered_l_count = filtered_l.shape[0] + + paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) + productive_cells = paired_cell.shape[0] + + paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) + productive_cells = paired_cell.shape[0] + unpaired_cell = paired_cell[paired_cell['CELL'] == 1] + paired_cell = paired_cell[paired_cell['CELL'] == 2] + paired_k = 0 + paired_l = 0 + + clones = pd.DataFrame() + cells = list(paired_cell.index) + aaseqs = [] + + for cell in cells: + if 'K' in list(filtered[filtered['CELL'] == cell]['LOCUS']): + paired_k += 1 + elif 'L' in list(filtered[filtered['CELL'] == cell]['LOCUS']): + paired_l += 1 + tep = filtered[filtered['CELL'] == cell] + tep_loci = list(tep['LOCUS']) + cdr3 = list(tep['JUNCTION']) + aaseq = [] + for seq in cdr3: + seq = Seq(seq) + seq = seq.translate() + aaseq.append(seq) + string = 'IG{}:{};IG{}:{}'.format(tep_loci[0], aaseq[0], tep_loci[1], aaseq[1]) + aaseqs.append(string) + + for cell in list(unpaired_cell.index): + cells.append(cell) + locus = list(filtered[filtered['CELL'] == cell]['LOCUS']) + cdr3 = list(filtered[filtered['CELL'] == cell]['JUNCTION']) + seq = Seq(cdr3[0]) + seq = seq.translate() + string = 'IG{}:{}'.format(locus[0], seq) + aaseqs.append(string) + + clones['CELLS'] = cells + + clones["cdr3s_aa"] = aaseqs + clone_count = pd.DataFrame(clones['cdr3s_aa'].value_counts()) + clone_count.columns = ["frequency"] + proportation = [] + sum = clone_count['frequency'].sum() + for f in list(clone_count['frequency']): + p = f/sum + proportation.append(p) + clone_count['proportation'] = proportation + clone_count = clone_count.reset_index() + clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + + stat_string_1 = f"BCR_H reconstruction:\t{filtered_h_count}/{productive_cells}\nBCR_K reconstruction:\t{filtered_k_count}/{productive_cells}\nBCR_L reconstruction:\t{filtered_l_count}/{productive_cells}\n" + + stat_string_2 = "Paired HK productive reconstruction:\t{}/{}\nPaired HL productive reconstruction:\t{}/{}\n".format(paired_k, productive_cells, paired_l, productive_cells) + + with open(f'{sum_dir}/stat.txt', 'w') as s: + s.write(stat_string_1) + s.write(stat_string_2) + +@utils.add_log +def vdj_sum(args): + type = args.type + ass_dir = args.ass_dir + sample = args.sample + outdir = args.outdir + + res_sum(type, ass_dir, outdir) + + +def get_opts_vdj_sum(parser, sub_program): + if sub_program: + parser = s_common(parser) + parser.add_argument('--ass_dir', help='assemble dir', required=True) + parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + + + -- Gitee From f5ce79bbdbffb567e23eada0ea7e5bb666f9eb98 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 1 Jun 2021 14:40:37 +0800 Subject: [PATCH 20/96] updata stat --- celescope/tracer_vdj/split_fastq.py | 12 ++++++++++-- celescope/tracer_vdj/vdj_sum.py | 2 +- 2 files changed, 11 
insertions(+), 3 deletions(-) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 024c295a..0c5a0223 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -55,17 +55,25 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): os.makedirs(fq_outdir) barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads + umi_count = defaultdict() reads_count_dict = {} # all barcodes and reads num for each barcode all_barcodes = [] # all barcodes with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') barcode = attr[0] + umi = attr[1] + umi_count[barcode][umi] += 1 all_barcodes.append(barcode) barcode_reads_dict[barcode].append(entry) for barcode in list(barcode_reads_dict.keys()): reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) + + umi_count_df = pd.DataFrame([(k, list(v.keys())[0], list(v.values())[0]) for k, v in umi_count.items()], columns=['Barcode', 'umi', 'umi_reads_count']) + + umi_df = umi_count_df.groupby(['Barcode']).agg({'UMI': 'count'}) + umi_df.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') barcodes_for_match = [] for barcode in barcodes: @@ -86,7 +94,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') - stat_string = 'All cells:{}\nmatched cell:{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) + stat_string = 'All_cells:\t{}\nmatched_cell:\t{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) with open(f'{fq_outdir}/../stat.txt', 'w') as s: s.write(stat_string) @@ -100,7 +108,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): get_fastq_to_assemble.logger.info(f'processed {i} cells') if i == len(list(barcode_reads_useful.keys())): - get_fastq_to_assemble.loogger.info(f'finnaly get {i} cells') + get_fastq_to_assemble.logger.info(f'finnaly get {i} cells') i += 1 diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 162e5f0c..f2020e0a 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -90,7 +90,7 @@ def res_sum(type, ass_dir, sum_dir): unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] paired_cell = paired_cell[paired_cell['cell_name'] == 2] paired_cell = list(paired_cell.index) - string1 = f'productive TRA:\t{count_a}/{productive_cells}\nproductive TRB:\t{count_b}/{productive_cells}\npaired TRA and TRB:\t{len(paired_cell)}/{productive_cells}\n' + string1 = f'productive_TRA:\t{count_a}/{productive_cells}\nproductive_TRB:\t{count_b}/{productive_cells}\npaired_TRA_and_TRB:\t{len(paired_cell)}/{productive_cells}\n' with open(f'{sum_dir}/stat.txt', 'w') as fh: fh.write(string1) -- Gitee From 3119aec0399e8f90f008b45044e98dc35cf7b53a Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Wed, 2 Jun 2021 19:14:37 +0800 Subject: [PATCH 21/96] update stat.txt and rewrite split_fq --- celescope/tracer_vdj/go_assemble.py | 112 ++++++++++++++++++++++- celescope/tracer_vdj/multi_tracer_vdj.py | 4 + celescope/tracer_vdj/split_fastq.py | 67 ++++++-------- celescope/tracer_vdj/vdj_sum.py | 68 +++++++++----- 4 files changed, 188 insertions(+), 63 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 6f276d23..fa4b70fd 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -6,6 +6,8 @@ from concurrent.futures import ProcessPoolExecutor from celescope.tools import utils from celescope.tools.utils import * import 
datetime +import glob +import pysam TRACER_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer' @@ -27,7 +29,6 @@ def bracer_summarise(outdir): f'--no_networks ' f'{bracer_outdir} ' ) - bracer_summarise.logger.info(cmd) os.system(cmd) @@ -40,13 +41,13 @@ def bracer(fq, outdir, species): f'--fragment_sd 5 ' f'--single_end ' f'--small_index ' + f'--no_trimming ' f'--species {species} ' f'-c {BRACER_CONF} ' f'{prefix} ' f'{outdir}/bracer ' f'{fq} ' ) - bracer.logger.info(cmd) os.system(cmd) @@ -59,7 +60,6 @@ def tracer_summarise(outdir): f'--no_networks ' f'{tracer_outdir} ' ) - tracer_summarise.logger.info(cmd) os.system(cmd) @@ -79,7 +79,6 @@ def tracer(fq, outdir, species): f'{prefix} ' f'{outdir}/tracer ' ) - tracer.logger.info(cmd) os.system(cmd) @@ -116,6 +115,111 @@ def run_bracer(outdir, fastq_dir, species, thread): bracer_summarise(outdir) +################def get_reads_count(fq): +# with pysam.FastxFile(fq) as fh: +# count = 0 +# for entry in fh: +# count += 1 +# return count + + +def get_umi_count(fq): + umis = [] + with pysam.FastxFile(fq) as fh: + for entry in fh: + attr = entry.name.split('_') + barcode = attr[0] + umi = attr[1] + umis.append(umi) + res = len(set(umis)) + return res + + +def get_assemble_stat(outdir, type): + + total_fq = f'{outdir}/../03.split_fastq/reads_count.tsv' + UMIs = pd.DataFrame(total_fq, sep='\t') + + all_UMIs = UMIs['UMIs_count'].sum() + stat_file = outdir + '/../04.go_assemble/stat.txt' + + if type == 'TCR': + TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] + TRA_UMIs_count = sum(TRA_UMIs) + TRA_ = format(TRA_UMIs_count, ',') + TRB_UMIs_count = sum(TRB_UMIs) + TRB_ = format(TRB_UMIs_count, ',') + + TRA_mapping = TRA_UMIs_count/all_UMIs + TRA_mapping = round(TRA_mapping, 4) + TRA_mapping = f'{TRA_}({TRA_mapping})' + + TRB_mapping = TRB_UMIs_count/all_UMIs + TRB_mapping = round(TRB_mapping, 4) + TRB_mapping = f'{TRB_}({TRB_mapping})' + + total_counts = TRA_UMIs_count + TRB_UMIs_count + total_ = format(total_counts, ',') + total_mapping = (total_counts)/all_UMIs + total_mapping = round(total_mapping, 4) + total_mapping = f'{total_}({total_mapping})' + + stat_text = pd.DataFrame({ + 'item': ['UMIs mapped to TRA or TRB', 'UMIs mapped to TRA', 'UMIs mapped to TRB'], 'count': [total_mapping, TRA_mapping, TRB_mapping] + }, columns=['item', 'count']) + + stat_text.to_csv(stat_file, sep=':', header=None, index=False) + + elif type == 'BCR': + IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') + + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + + + IGH = sum(IGH_UMIs) + IGH_ = format(IGH, ',') + IGK = sum(IGK_UMIs) + IGK_ = format(IGK, ',') + IGL = sum(IGL_UMIs) + IGL_ = format(IGL, ',') + + IGH_mapping = IGH/all_UMIs + IGH_mapping = round(IGH_mapping, 4) + IGH_mapping = f'{IGH_}({IGH_mapping})' + + IGK_mapping = IGK/all_UMIs + IGK_mapping = round(IGK_mapping, 4) + IGK_mapping = f'{IGK_}({IGK_mapping})' + + IGL_mapping = IGL/all_UMIs + IGL_mapping = round(IGL_mapping, 4) + IGL_mapping = f'{IGL_}({IGL_mapping})' + + total_counts = IGH + IGK + IGL + total_ = format(total_counts, ',') + + total_mapping = (total_counts)/all_UMIs + 
total_mapping = round(total_mapping, 4) + total_mapping = f'{total_}({total_mapping})' + + stat_text = pd.DataFrame({ + 'item': ['UMIs mapped to IGH, IGK or IGL', 'UMIs mapped to IGH', 'UMIs mapped to IGK', 'UMIs mapped to IGL'], 'count': [total_mapping, IGH_mapping, IGK_mapping, IGL_mapping] + }) + + stat_text.to_csv(stat_file, sep=':', header=None, index=False) + + + + + + def go_assemble(args): thread = int(args.thread) fastq_dir = args.fastq_dir diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 173b1e52..18ee9f98 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -30,9 +30,13 @@ class Multi_tracer_vdj(Multi): step = 'vdj_sum' cmd_line = self.get_cmd_line(step, sample) ass_dir = f'{self.outdir_dic[sample]["go_assemble"]}' + + fastq_dir = f'{self.outdir_dic[sample]["split_fastq"]}/fastq' + cmd = ( f'{cmd_line} ' f'--ass_dir {ass_dir} ' + f'--fastq_dir {fastq_dir} ' ) self.process_cmd(cmd, step, sample, m=5, x=2) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 0c5a0223..699be4c5 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -7,7 +7,7 @@ import pandas as pd from Bio.Seq import Seq import glob from celescope.tools import utils -from celescope.tools.utils import * +from celescope.tools.Step import Step, s_common @utils.add_log @@ -33,16 +33,17 @@ def annotation_barcodes(match_dir, type): barcodes += tmp # write barcodes barcodes_path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') - barcodes_path = barcodes_path[0] + barcodes_path = barcodes_path[0] + + res = [] with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: for barcode in barcodes: barcode = Seq(barcode) barcode_reversed = barcode.reverse_complement() bc = str(barcode_reversed) + res.append(bc) fh.write(bc + '\n') - with open(f'{barcodes_path}/reversed_barcodes.tsv') as res: - res = res.readlines() return res @@ -55,62 +56,54 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): os.makedirs(fq_outdir) barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads - umi_count = defaultdict() + # umi_count = defaultdict(list) reads_count_dict = {} # all barcodes and reads num for each barcode - all_barcodes = [] # all barcodes + umi_count_dict = defaultdict(list) + umi_count = {} with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') barcode = attr[0] umi = attr[1] - umi_count[barcode][umi] += 1 - all_barcodes.append(barcode) - barcode_reads_dict[barcode].append(entry) - for barcode in list(barcode_reads_dict.keys()): + if barcode in barcodes: + barcode_reads_dict[barcode].append(entry) + if umi_count_dict[barcode].count(umi) == 0: + umi_count_dict[barcode].append(umi) + for barcode in barcodes: reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - - umi_count_df = pd.DataFrame([(k, list(v.keys())[0], list(v.values())[0]) for k, v in umi_count.items()], columns=['Barcode', 'umi', 'umi_reads_count']) - umi_df = umi_count_df.groupby(['Barcode']).agg({'UMI': 'count'}) - - umi_df.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - - barcodes_for_match = [] - for barcode in barcodes: - barcode = barcode.strip('\n') - barcodes_for_match.append(barcode) - barcodes_to_use = list(set(barcodes_for_match).intersection(set(all_barcodes))) - # barcodes in both RNA data and BCR data + umi_count[barcode] = len(umi_count_dict[barcode]) - barcode_reads_useful = {barcode: 
barcode_reads_dict[barcode] for barcode in barcodes_to_use} + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMIs_count']) + df_umi = df_umi.reset_index().rename(columns={'index': 'barcode'}) + reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['reads_count']) + reads_count = reads_count.reset_index().rename(columns={'index': 'barcode'}) - barcodes_reads_count = {barcode: reads_count_dict[barcode] for barcode in - list(barcode_reads_useful.keys())} + df_f = pd.merge(reads_count, df_umi, on='barcode', how='inner') - barcodes_reads_cal = pd.DataFrame.from_dict(barcodes_reads_count, orient='index',columns=['counts']) - barcodes_reads_cal = barcodes_reads_cal.reset_index().rename(columns={'index': 'barcode'}) - barcodes_reads_cal = barcodes_reads_cal.sort_values(by='counts', ascending=False) + df_f = df_f.set_index('barcode') - barcodes_reads_cal.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') + i = 1 - stat_string = 'All_cells:\t{}\nmatched_cell:\t{}\n'.format(len(all_barcodes), len(barcode_reads_useful)) - with open(f'{fq_outdir}/../stat.txt', 'w') as s: - s.write(stat_string) + for barcode in barcodes: - i = 1 - for barcode in list(barcode_reads_useful.keys()): + df_f.loc[barcode, 'cell_name'] = i with open(f'{fq_outdir}/{i}.fq', 'w') as f: - for entry in barcode_reads_useful[barcode]: + for entry in barcode_reads_dict[barcode]: f.write(str(entry) + '\n') + if i % 1000 == 0: get_fastq_to_assemble.logger.info(f'processed {i} cells') - if i == len(list(barcode_reads_useful.keys())): - get_fastq_to_assemble.logger.info(f'finnaly get {i} cells') + if i == len(barcodes): + get_fastq_to_assemble.logger.info(f'finally get {i} cells') i += 1 + + df_f = df_f.astype(int) + df_f.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index f2020e0a..803e97ab 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -7,7 +7,9 @@ import pandas as pd from Bio.Seq import Seq import glob from celescope.tools import utils -from celescope.tools.utils import * +from celescope.tools.Step import Step, s_common +import glob + def tpm_count(ass_dir): @@ -30,10 +32,10 @@ def tpm_count(ass_dir): return productive -def filtering(type, ass_dir, sum_dir): +def filtering(type, ass_dir, outdir): - if not os.path.exists(sum_dir): - os.makedirs(sum_dir) + if not os.path.exists(outdir): + os.makedirs(outdir) if type == 'TCR': data = tpm_count(ass_dir) @@ -51,7 +53,7 @@ def filtering(type, ass_dir, sum_dir): trb = trb.sort_values(by='TPM', ascending=False) trb = trb.head(1) filtered = filtered.append(trb, ignore_index=True) - filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') elif type == 'BCR': @@ -74,13 +76,13 @@ def filtering(type, ass_dir, sum_dir): count_k_l = count_k_l.head(1) filtered = filtered.append(count_k_l, ignore_index=True) - filtered.to_csv(f'{sum_dir}/filtered.txt', sep='\t') + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') return filtered -def res_sum(type, ass_dir, sum_dir): - filtered = filtering(type, ass_dir, sum_dir) +def res_sum(type, ass_dir, outdir): + filtered = filtering(type, ass_dir, outdir) if type == 'TCR': count_a = filtered[filtered['locus'] == 'A'].shape[0] @@ -90,10 +92,6 @@ def res_sum(type, ass_dir, sum_dir): unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] paired_cell = paired_cell[paired_cell['cell_name'] == 2] paired_cell = list(paired_cell.index) - string1 = 
f'productive_TRA:\t{count_a}/{productive_cells}\nproductive_TRB:\t{count_b}/{productive_cells}\npaired_TRA_and_TRB:\t{len(paired_cell)}/{productive_cells}\n' - - with open(f'{sum_dir}/stat.txt', 'w') as fh: - fh.write(string1) aaseqs = [] for cell in paired_cell: @@ -122,7 +120,9 @@ def res_sum(type, ass_dir, sum_dir): clone_count['proportation'] = proportation clone_count = clone_count.reset_index() clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') + + return productive_cells, count_a, count_b, paired_cell elif type == 'BCR': filtered_h = filtered[filtered['LOCUS'] == 'H'] @@ -184,15 +184,37 @@ def res_sum(type, ass_dir, sum_dir): clone_count['proportation'] = proportation clone_count = clone_count.reset_index() clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{sum_dir}/clone_count.tsv', sep='\t') + clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') - stat_string_1 = f"BCR_H reconstruction:\t{filtered_h_count}/{productive_cells}\nBCR_K reconstruction:\t{filtered_k_count}/{productive_cells}\nBCR_L reconstruction:\t{filtered_l_count}/{productive_cells}\n" - - stat_string_2 = "Paired HK productive reconstruction:\t{}/{}\nPaired HL productive reconstruction:\t{}/{}\n".format(paired_k, productive_cells, paired_l, productive_cells) + return productive_cells, filtered_h_count, filtered_k_count, filtered_l_count, paired_k, paired_l + + + +def get_stat(fastq_dir, ass_dir, outdir, type): + fqs = glob.glob(f'{fastq_dir}/*.fq') + matched_bcs = len(fqs) + + stat_file = outdir + '/stat.txt' + if type == 'TCR': + productive_cells, TRA_num, TRB_num, paired_num = res_sum(type, ass_dir, outdir) + + stat_text = pd.DataFrame({ + 'item': ['Matched cells', 'Productive cells', 'Cells with TRA', 'Cells with TRB', 'Cells with paired TRA and TRB'], + 'count': [matched_bcs, productive_cells, TRA_num, TRB_num, paired_num] + }, + columns=['item', 'count']) + stat_text.to_csv(stat_file, sep=':', header=None, index=False) + + elif type == 'BCR': + productive_cells, H_num, K_num, L_num, H_K_num, H_L_num = res_sum(type, ass_dir,outdir) + + stat_text = pd.DataFrame({ + 'item': ['Matched cells', 'Productive cells', 'Cells with IGH', 'Cells with IGK', 'Cells with IGL', 'Cells with IGH and IGK', 'Cells with IGH and IGL'], + 'count': [matched_bcs, productive_cells, H_num, K_num, L_num, H_K_num, H_L_num] + }, + columns=['item', 'count']) + stat_text.to_csv(stat_file, sep=":", header=None, index=False) - with open(f'{sum_dir}/stat.txt', 'w') as s: - s.write(stat_string_1) - s.write(stat_string_2) @utils.add_log def vdj_sum(args): @@ -200,14 +222,16 @@ def vdj_sum(args): ass_dir = args.ass_dir sample = args.sample outdir = args.outdir - - res_sum(type, ass_dir, outdir) + fastq_dir = args.fastq_dir + + get_stat(fastq_dir, ass_dir, outdir, type) def get_opts_vdj_sum(parser, sub_program): if sub_program: parser = s_common(parser) parser.add_argument('--ass_dir', help='assemble dir', required=True) + parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) -- Gitee From ade04c76c41c86a99e9efd3caca3f680ab07d88e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 3 Jun 2021 16:59:38 +0800 Subject: [PATCH 22/96] generate stat --- celescope/tracer_vdj/go_assemble.py | 104 +++++----- celescope/tracer_vdj/vdj_sum.py | 288 ++++++++++++++++++++++------ 2 
files changed, 290 insertions(+), 102 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index fa4b70fd..cc8912a3 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -8,6 +8,8 @@ from celescope.tools.utils import * import datetime import glob import pysam +import numpy as np +from celescope.tools.Step import Step, s_common TRACER_PATH = '/SGRNJ03/randd/zhouxin/software/tracer/tracer' @@ -135,43 +137,46 @@ def get_umi_count(fq): return res -def get_assemble_stat(outdir, type): +def go_assemble_summary(outdir, type): total_fq = f'{outdir}/../03.split_fastq/reads_count.tsv' - UMIs = pd.DataFrame(total_fq, sep='\t') + UMIs = pd.read_csv(total_fq, sep='\t') - all_UMIs = UMIs['UMIs_count'].sum() + all_UMIs = UMIs['UMIs_count'].tolist() + medians = int(np.median(all_UMIs)) + all_UMIs = sum(all_UMIs) + stat_file = outdir + '/../04.go_assemble/stat.txt' + go_assemble_summary = [] + if type == 'TCR': TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') TRA_UMIs = [get_umi_count(fq) for fq in TRAs] TRB_UMIs = [get_umi_count(fq) for fq in TRBs] TRA_UMIs_count = sum(TRA_UMIs) - TRA_ = format(TRA_UMIs_count, ',') + medianA = int(np.median(TRA_UMIs)) TRB_UMIs_count = sum(TRB_UMIs) - TRB_ = format(TRB_UMIs_count, ',') - - TRA_mapping = TRA_UMIs_count/all_UMIs - TRA_mapping = round(TRA_mapping, 4) - TRA_mapping = f'{TRA_}({TRA_mapping})' - - TRB_mapping = TRB_UMIs_count/all_UMIs - TRB_mapping = round(TRB_mapping, 4) - TRB_mapping = f'{TRB_}({TRB_mapping})' - + medianB = int(np.median(TRB_UMIs)) total_counts = TRA_UMIs_count + TRB_UMIs_count - total_ = format(total_counts, ',') - total_mapping = (total_counts)/all_UMIs - total_mapping = round(total_mapping, 4) - total_mapping = f'{total_}({total_mapping})' - stat_text = pd.DataFrame({ - 'item': ['UMIs mapped to TRA or TRB', 'UMIs mapped to TRA', 'UMIs mapped to TRB'], 'count': [total_mapping, TRA_mapping, TRB_mapping] - }, columns=['item', 'count']) + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRA', + 'count': TRA_UMIs_count, + 'total_count': total_counts, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRB', + 'count': TRB_UMIs_count, + 'total_count': total_counts, + }) - stat_text.to_csv(stat_file, sep=':', header=None, index=False) + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Madian UMIs per cell:{medians}\n') + f.write(f'Median TRA UMIs per cell:{medianA}\n') + f.write(f'Median TRB UMIs per cell:{medianB}\n') elif type == 'BCR': IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') @@ -182,45 +187,47 @@ def get_assemble_stat(outdir, type): IGK_UMIs = [get_umi_count(fq) for fq in IGKs] IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - IGH = sum(IGH_UMIs) - IGH_ = format(IGH, ',') + medianH = np.median(IGH_UMIs) IGK = sum(IGK_UMIs) - IGK_ = format(IGK, ',') + medianK = np.median(IGK_UMIs) IGL = sum(IGL_UMIs) - IGL_ = format(IGL, ',') - - IGH_mapping = IGH/all_UMIs - IGH_mapping = round(IGH_mapping, 4) - IGH_mapping = f'{IGH_}({IGH_mapping})' - - IGK_mapping = IGK/all_UMIs - IGK_mapping = round(IGK_mapping, 4) - IGK_mapping = f'{IGK_}({IGK_mapping})' - - IGL_mapping = IGL/all_UMIs - IGL_mapping = round(IGL_mapping, 4) - IGL_mapping = f'{IGL_}({IGL_mapping})' + medianL = np.median(IGL_UMIs) total_counts = IGH + IGK + IGL - total_ = format(total_counts, ',') - - total_mapping = (total_counts)/all_UMIs - total_mapping = 
round(total_mapping, 4) - total_mapping = f'{total_}({total_mapping})' - stat_text = pd.DataFrame({ - 'item': ['UMIs mapped to IGH, IGK or IGL', 'UMIs mapped to IGH', 'UMIs mapped to IGK', 'UMIs mapped to IGL'], 'count': [total_mapping, IGH_mapping, IGK_mapping, IGL_mapping] + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGH', + 'count': IGH, + 'total_count': total_counts, }) - stat_text.to_csv(stat_file, sep=':', header=None, index=False) - + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGK', + 'count': IGK, + 'total_count': total_counts, + }) + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGL', + 'count': IGL, + 'total_count': total_counts, + }) + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median IGH UMIs per Cell:{medianH}\n') + f.write(f'Median IGK UMIs per Cell:{medianK}\n') + f.write(f'Median IGL UMIs per Cell:{medianL}\n') + + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) + utils.gen_stat(df, stat_file) def go_assemble(args): + step_name = 'go_assemble' + step = Step(args, step_name) thread = int(args.thread) fastq_dir = args.fastq_dir outdir = args.outdir @@ -232,6 +239,9 @@ def go_assemble(args): elif type == 'BCR': run_bracer(outdir, fastq_dir, species, thread) + go_assemble_summary(outdir, type) + + step.clean_up() def get_opts_go_assemble(parser, sub_program): if sub_program: diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 803e97ab..661e84df 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -6,15 +6,15 @@ import datetime import pandas as pd from Bio.Seq import Seq import glob +import re +import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common import glob - def tpm_count(ass_dir): - rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recom' # ass_dir outdir/sample/04.go_assemble - f'binants.txt', sep='\t') + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble productive = rec[rec['productive'] == True] productive['TPM'] = '' indx = list(productive.index) @@ -53,6 +53,7 @@ def filtering(type, ass_dir, outdir): trb = trb.sort_values(by='TPM', ascending=False) trb = trb.head(1) filtered = filtered.append(trb, ignore_index=True) + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') elif type == 'BCR': @@ -81,10 +82,43 @@ def filtering(type, ass_dir, outdir): return filtered -def res_sum(type, ass_dir, outdir): +@utils.add_log +def vdj_sum(args): + + step_name = f"vdj_sum" + step = Step(args, step_name) + + type = args.type + ass_dir = args.ass_dir + sample = args.sample + outdir = args.outdir + fastq_dir = args.fastq_dir + filtered = filtering(type, ass_dir, outdir) + fqs = glob.glob(f'{fastq_dir}/*.fq') + matched_bcs = len(fqs) + + stat_file = outdir + '/stat.txt' + + vdj_sum_summary = [] + if type == 'TCR': + + CB = filtered['cell_name'].tolist() + + df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') + + all_cells = df_umi['cell_name'].tolist() + + df_umi = df_umi.set_index('cell_name') + + for i in all_cells: + if i in CB: + df_umi.loc[i, 'mark'] = 'CB' + else: + df_umi.loc[i, 'mark'] = 'UB' + count_a = filtered[filtered['locus'] == 'A'].shape[0] count_b = filtered[filtered['locus'] == 'B'].shape[0] paired_cell = pd.DataFrame(filtered['cell_name'].value_counts()) @@ -110,21 +144,89 @@ def res_sum(type, ass_dir, outdir): 
per_count_data = pd.DataFrame() per_count_data['cdr3s_aa'] = aaseqs - clone_count = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) - clone_count.columns = ["frequency"] - proportation = [] - sum = clone_count['frequency'].sum() - for f in list(clone_count['frequency']): + clonetypes = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) + clonetypes.columns = ["Frequency"] + Percent = [] + sum = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): p = f/sum - proportation.append(p) - clone_count['proportation'] = proportation - clone_count = clone_count.reset_index() - clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') + Percent.append(p) + clonetypes['Percent'] = Percent + clonetypes = clonetypes.reset_index() + clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + vdj_sum_summary.append({ + 'item': 'Estimated Number of Cells', + 'count': matched_bcs, + 'total_count': matched_bcs, + }) + + vdj_sum_summary.append({ + 'item': 'Productive cells', + 'count': productive_cells, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRA', + 'count': count_a, + 'total_count': matched_bcs, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRB', + 'count': count_b, + 'total_count': matched_bcs, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired TRA and TRB', + 'count': len(paired_cell), + 'total_count': matched_bcs, + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians = [] + for line in f: + line = line.rstrip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) - return productive_cells, count_a, count_b, paired_cell elif type == 'BCR': + + CB = filtered['CELL'].tolist() + + df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') + + all_cells = df_umi['cell_name'].tolist() + + df_umi = df_umi.set_index('cell_name') + + for i in all_cells: + if i in CB: + df_umi.loc[i, 'mark'] = 'CB' + else: + df_umi.loc[i, 'mark'] = 'UB' + filtered_h = filtered[filtered['LOCUS'] == 'H'] filtered_k = filtered[filtered['LOCUS'] == 'K'] filtered_l = filtered[filtered['LOCUS'] == 'L'] @@ -174,57 +276,133 @@ def res_sum(type, ass_dir, outdir): clones['CELLS'] = cells clones["cdr3s_aa"] = aaseqs - clone_count = pd.DataFrame(clones['cdr3s_aa'].value_counts()) - clone_count.columns = ["frequency"] - proportation = [] - sum = clone_count['frequency'].sum() - for f in list(clone_count['frequency']): + clonetypes = pd.DataFrame(clones['cdr3s_aa'].value_counts()) + clonetypes.columns = ["Frequency"] + Percent = [] + sum = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): p = f/sum - proportation.append(p) - clone_count['proportation'] = proportation - clone_count = clone_count.reset_index() - clone_count.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clone_count.to_csv(f'{outdir}/clone_count.tsv', sep='\t') - - return productive_cells, filtered_h_count, filtered_k_count, filtered_l_count, paired_k, paired_l + Percent.append(p) + clonetypes['Percent'] = Percent + clonetypes = clonetypes.reset_index() + 
clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + + vdj_sum_summary.append({ + 'item': 'Matched cells', + 'count': matched_bcs, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Productive cells', + 'count': productive_cells, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH', + 'count': filtered_h_count, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGK', + 'count': filtered_k_count, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGL', + 'count': filtered_l_count, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH and IGK', + 'count': paired_k, + 'total_count': matched_bcs + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH and IGL', + 'count': paired_l, + 'total_count': matched_bcs + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians=[] + for line in f: + line = line.strip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medians[3], + 'total_count': np.nan + }) + + df = pd.DataFrame(vdj_sum_summary, + columns=['item', 'count', 'total_count']) + + df['count'] = df['count'].apply(int) + + df['percent'] = df['count']/(df.total_count.astype('float')) * 100 + df['percent'] = df['percent'].apply( + lambda x: round(x, 2) + ) + df['count'] = df['count'].apply(utils.format_number) -def get_stat(fastq_dir, ass_dir, outdir, type): - fqs = glob.glob(f'{fastq_dir}/*.fq') - matched_bcs = len(fqs) + def percent_str_func(row): + need_percent = bool( + re.search("Cells with", row["item"], flags=re.IGNORECASE)) + if need_percent: + return "(" + str(row["percent"]) + "%)" + else: + return "" - stat_file = outdir + '/stat.txt' - if type == 'TCR': - productive_cells, TRA_num, TRB_num, paired_num = res_sum(type, ass_dir, outdir) + df['percent_str'] = df.apply( + lambda row: percent_str_func(row), axis=1 + ) - stat_text = pd.DataFrame({ - 'item': ['Matched cells', 'Productive cells', 'Cells with TRA', 'Cells with TRB', 'Cells with paired TRA and TRB'], - 'count': [matched_bcs, productive_cells, TRA_num, TRB_num, paired_num] - }, - columns=['item', 'count']) - stat_text.to_csv(stat_file, sep=':', header=None, index=False) + def gen_stat(summary, stat_file): + stat = summary + stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] + stat = stat.loc[:, ["item", "new_count"]] + stat.to_csv(stat_file, sep=":", header=None, index=False) - elif type == 'BCR': - productive_cells, H_num, K_num, L_num, H_K_num, H_L_num = res_sum(type, ass_dir,outdir) + gen_stat(df, stat_file) - stat_text = pd.DataFrame({ - 'item': ['Matched cells', 'Productive cells', 'Cells with IGH', 'Cells with IGK', 'Cells with IGL', 'Cells with IGH and IGK', 'Cells with IGH and IGL'], - 'count': [matched_bcs, productive_cells, H_num, K_num, L_num, H_K_num, H_L_num] - }, - columns=['item', 'count']) - stat_text.to_csv(stat_file, sep=":", header=None, index=False) +# clonetype table + clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: 
str(x*100) + '%') + title = 'Clonetypes' + table_dict = step.get_table(title, 'clonetypes_table', clonetypes) -@utils.add_log -def vdj_sum(args): - type = args.type - ass_dir = args.ass_dir - sample = args.sample - outdir = args.outdir - fastq_dir = args.fastq_dir + step.add_data_item(table_dict=table_dict) - get_stat(fastq_dir, ass_dir, outdir, type) + step.clean_up() def get_opts_vdj_sum(parser, sub_program): -- Gitee From 32abee2adb3e8570b49c0ec3a9a5bf3154381a99 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 3 Jun 2021 18:19:42 +0800 Subject: [PATCH 23/96] add vdj_sum (clonetypes table and cell) to reports --- celescope/templates/html/tracer_vdj/base.html | 10 ++--- .../html/tracer_vdj/clonetypes_table.html | 37 +++++++++++++++++++ .../html/tracer_vdj/vdj_sum_summary.html | 29 +++++++++++++++ celescope/tracer_vdj/vdj_sum.py | 2 +- 4 files changed, 72 insertions(+), 6 deletions(-) create mode 100644 celescope/templates/html/tracer_vdj/clonetypes_table.html create mode 100644 celescope/templates/html/tracer_vdj/vdj_sum_summary.html diff --git a/celescope/templates/html/tracer_vdj/base.html b/celescope/templates/html/tracer_vdj/base.html index 54ad567e..5318bb34 100755 --- a/celescope/templates/html/tracer_vdj/base.html +++ b/celescope/templates/html/tracer_vdj/base.html @@ -137,18 +137,18 @@ {% include "html/common/cutadapt_summary.html"%} {% endif %} - {% if consensus_summary is defined %} - {% include "html/common/consensus_summary.html"%} - {% endif %} - {% if split_fastq is defined %} {% include "html/tracer_vdj/split_fastq_summary.html"%} {% endif %} {% if go_assemble_summary is defined %} {% include "html/tracer_vdj/go_assemble_summary.html"%} - {% endif %} + {% endif %} + {% if vdj_sum_summary is defined %} + {% include "html/tracer_vdj/vdj_sum_summary.html"%} + {% endif %} + {% if table_dict is defined %} {% include "html/vdj/clonetypes_table.html"%} {% endif %} diff --git a/celescope/templates/html/tracer_vdj/clonetypes_table.html b/celescope/templates/html/tracer_vdj/clonetypes_table.html new file mode 100644 index 00000000..c4510563 --- /dev/null +++ b/celescope/templates/html/tracer_vdj/clonetypes_table.html @@ -0,0 +1,37 @@ + + + + + + + + + + + + + + + +
+<!-- collapsible table card markup (lost in extraction) -->
+    {{ table_dict['title'] }}
+    {{ table_dict['table'] | safe }}
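The card above expects table_dict['table'] to hold ready-made HTML, which is why it is piped through "| safe". A hedged sketch of how such a dict could be assembled from the clonetypes DataFrame; Step.get_table's real keys and markup are not shown in this patch, so the structure below is an assumption:

import pandas as pd

clonetypes = pd.DataFrame({
    "cdr3s_aa": ["TRA:CAVRDNYGQNFVF;TRB:CASSLGGEQYF"],
    "Frequency": [12],
    "Percent": ["60.0%"],
})

# Hypothetical equivalent of Step.get_table(title, table_id, df)
table_dict = {
    "title": "Clonetypes",
    "id": "clonetypes_table",
    "table": clonetypes.to_html(index=False, escape=True),  # pre-rendered <table>
}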
\ No newline at end of file diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html new file mode 100644 index 00000000..3c79d24f --- /dev/null +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -0,0 +1,29 @@ +
+<!-- "Cell" summary card markup (lost in extraction) -->
+    {% for item in vdj_sum_summary %}
+        {% for i in item %}
+            {{ i|e }}
+        {% endfor %}
+    {% endfor %}
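In the Cell card, every entry of vdj_sum_summary is itself iterated, so each metric must be an iterable whose elements become one cell each. Given how vdj_sum.py joins its count and percent_str columns, a plausible (but assumed) row shape is a (name, formatted count) pair:

# Assumed row shape for the template's nested loop; the exact structure the
# pipeline passes is not visible in this patch.
vdj_sum_summary = [
    ("Estimated Number of Cells", "1,034"),
    ("Productive cells", "980(94.78%)"),
    ("Cells with TRA", "901(87.14%)"),
]
for item in vdj_sum_summary:   # outer {% for item in vdj_sum_summary %}
    print("\t".join(item))     # inner loop renders each i, HTML-escaped by |e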
\ No newline at end of file diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 661e84df..aec8dde6 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -396,7 +396,7 @@ def vdj_sum(args): # clonetype table - clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: str(x*100) + '%') + clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: str(round(x*100, 2)) + '%') title = 'Clonetypes' table_dict = step.get_table(title, 'clonetypes_table', clonetypes) -- Gitee From f2f275a2ec447ba4be1d54d8cc2d780ecbc20fad Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 4 Jun 2021 16:37:42 +0800 Subject: [PATCH 24/96] report1.0 total:TCR/BCR data, cell:matched barcode --- .../html/tracer_vdj/go_assemble_summary.html | 34 +++++++++++++++++++ .../html/tracer_vdj/vdj_sum_summary.html | 6 ++++ celescope/tracer_vdj/go_assemble.py | 15 ++++---- celescope/tracer_vdj/split_fastq.py | 22 +++++++----- celescope/tracer_vdj/vdj_sum.py | 34 +++++-------------- 5 files changed, 68 insertions(+), 43 deletions(-) create mode 100644 celescope/templates/html/tracer_vdj/go_assemble_summary.html diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html new file mode 100644 index 00000000..3a1cdb11 --- /dev/null +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -0,0 +1,34 @@ +
+

Mapping

+
+ + + {% for item in go_assemble_summary %} + {% if loop.index <= (loop.length+1)/2 %} + + {% for i in item %} + + {% endfor %} + + {% endif %} + {% endfor %} +
{{ i|e }}
+ + + {% for item in go_assemble_summary %} + {% if loop.index > (loop.length+1)/2 %} + + {% for i in item %} + + {% endfor %} + + {% endif %} + {% endfor %} +
{{ i|e }}
+
+
+
\ No newline at end of file diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html index 3c79d24f..f8881c7d 100644 --- a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -24,6 +24,12 @@ {% endfor %} +
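+        <!-- interactive barcode-rank plot: vdj_sum.py injects this chart via get_plot_elements.plot_barcode_rank(count_umi) -->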
+ {{ chart|safe }} +
+ +
+
\ No newline at end of file diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index cc8912a3..5750f192 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -44,6 +44,7 @@ def bracer(fq, outdir, species): f'--single_end ' f'--small_index ' f'--no_trimming ' + f'-r ' f'--species {species} ' f'-c {BRACER_CONF} ' f'{prefix} ' @@ -75,6 +76,7 @@ def tracer(fq, outdir, species): f'--single_end ' f'--small_index ' f'-m assembly ' + f'-r ' f'--species {species} ' f'-c {CONF_PATH} ' f'{fq} ' @@ -159,18 +161,17 @@ def go_assemble_summary(outdir, type): medianA = int(np.median(TRA_UMIs)) TRB_UMIs_count = sum(TRB_UMIs) medianB = int(np.median(TRB_UMIs)) - total_counts = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ 'item': f'UMIs mapped to TRA', 'count': TRA_UMIs_count, - 'total_count': total_counts, + 'total_count': all_UMIs, }) go_assemble_summary.append({ 'item': f'UMIs mapped to TRB', 'count': TRB_UMIs_count, - 'total_count': total_counts, + 'total_count': all_UMIs, }) with open(f'{outdir}/tmp.txt', 'w') as f: @@ -194,24 +195,22 @@ def go_assemble_summary(outdir, type): IGL = sum(IGL_UMIs) medianL = np.median(IGL_UMIs) - total_counts = IGH + IGK + IGL - go_assemble_summary.append({ 'item': f'UMIs mapped to IGH', 'count': IGH, - 'total_count': total_counts, + 'total_count': all_UMIs, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGK', 'count': IGK, - 'total_count': total_counts, + 'total_count': all_UMIs, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGL', 'count': IGL, - 'total_count': total_counts, + 'total_count': all_UMIs, }) with open(f'{outdir}/tmp.txt', 'w') as f: diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index 699be4c5..e9076627 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -60,6 +60,7 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): reads_count_dict = {} # all barcodes and reads num for each barcode umi_count_dict = defaultdict(list) umi_count = {} + with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') @@ -67,22 +68,25 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): umi = attr[1] if barcode in barcodes: barcode_reads_dict[barcode].append(entry) - if umi_count_dict[barcode].count(umi) == 0: - umi_count_dict[barcode].append(umi) + if umi_count_dict[barcode].count(umi) == 0: + umi_count_dict[barcode].append(umi) for barcode in barcodes: reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - + + for barcode in list(umi_count_dict.keys()): umi_count[barcode] = len(umi_count_dict[barcode]) - df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMIs_count']) - df_umi = df_umi.reset_index().rename(columns={'index': 'barcode'}) + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) + + df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['reads_count']) - reads_count = reads_count.reset_index().rename(columns={'index': 'barcode'}) + reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) + reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) - df_f = pd.merge(reads_count, df_umi, on='barcode', how='inner') + df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') - df_f = 
df_f.set_index('barcode') + df_f = df_f.set_index('Barcode') i = 1 diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index aec8dde6..759b0e02 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -11,6 +11,8 @@ import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common import glob +from celescope.tools.cellranger3 import get_plot_elements +import json def tpm_count(ass_dir): @@ -93,6 +95,7 @@ def vdj_sum(args): sample = args.sample outdir = args.outdir fastq_dir = args.fastq_dir + UMI_min = args.UMI_min filtered = filtering(type, ass_dir, outdir) @@ -102,22 +105,12 @@ def vdj_sum(args): stat_file = outdir + '/stat.txt' vdj_sum_summary = [] + + count_umi = f'{fastq_dir}/../umi_count.tsv' if type == 'TCR': - CB = filtered['cell_name'].tolist() - - df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') - - all_cells = df_umi['cell_name'].tolist() - - df_umi = df_umi.set_index('cell_name') - - for i in all_cells: - if i in CB: - df_umi.loc[i, 'mark'] = 'CB' - else: - df_umi.loc[i, 'mark'] = 'UB' + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) count_a = filtered[filtered['locus'] == 'A'].shape[0] count_b = filtered[filtered['locus'] == 'B'].shape[0] @@ -213,19 +206,7 @@ def vdj_sum(args): elif type == 'BCR': - CB = filtered['CELL'].tolist() - - df_umi = pd.read_csv(f'{outdir}/../03.split_fastq/reads_count.tsv', sep='\t') - - all_cells = df_umi['cell_name'].tolist() - - df_umi = df_umi.set_index('cell_name') - - for i in all_cells: - if i in CB: - df_umi.loc[i, 'mark'] = 'CB' - else: - df_umi.loc[i, 'mark'] = 'UB' + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) filtered_h = filtered[filtered['LOCUS'] == 'H'] filtered_k = filtered[filtered['LOCUS'] == 'K'] @@ -411,6 +392,7 @@ def get_opts_vdj_sum(parser, sub_program): parser.add_argument('--ass_dir', help='assemble dir', required=True) parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--UMI_min', help='int, min UMI per cell, if not set, will be counted by UMI rank 20', default='auto') -- Gitee From 3764a63fe9462052075724b933ff29ee44c7657f Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 4 Jun 2021 16:46:38 +0800 Subject: [PATCH 25/96] ranked by UMI --- celescope/tracer_vdj/split_fastq.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index e9076627..ebcff1f6 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -76,14 +76,20 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): for barcode in list(umi_count_dict.keys()): umi_count[barcode] = len(umi_count_dict[barcode]) - df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = df_umi.sort_values(by='UMI', ascending=False) df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) - df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) + CB = reads_count['Barcode'].tolist() + + df_umi['mark'] = df_umi["Barcode"].apply( + lambda x: "CB" if (x in CB) 
else "UB") + + df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') + df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') df_f = df_f.set_index('Barcode') -- Gitee From 1797cc7b8e41bac10718d25da849be0baf7bd784 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 13:22:38 +0800 Subject: [PATCH 26/96] rewrite clonetype table and productive cells for UMI --- .../html/tracer_vdj/go_assemble_summary.html | 6 + celescope/tracer_vdj/go_assemble.py | 16 ++ celescope/tracer_vdj/split_fastq.py | 81 +++--- celescope/tracer_vdj/vdj_sum.py | 243 ++++++++++-------- 4 files changed, 199 insertions(+), 147 deletions(-) diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html index 3a1cdb11..768043e2 100644 --- a/celescope/templates/html/tracer_vdj/go_assemble_summary.html +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -2,9 +2,15 @@

Mapping

{% for item in go_assemble_summary %} diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 5750f192..c456bae1 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -162,6 +162,14 @@ def go_assemble_summary(outdir, type): TRB_UMIs_count = sum(TRB_UMIs) medianB = int(np.median(TRB_UMIs)) + totals = TRA_UMIs_count + TRB_UMIs_count + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to TRA or TRB', + 'count': totals, + 'total_count': all_UMIs, + }) + go_assemble_summary.append({ 'item': f'UMIs mapped to TRA', 'count': TRA_UMIs_count, @@ -195,6 +203,14 @@ def go_assemble_summary(outdir, type): IGL = sum(IGL_UMIs) medianL = np.median(IGL_UMIs) + totals = IGH + IGK + IGL + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'count': totals, + 'total_count': all_UMIs, + }) + go_assemble_summary.append({ 'item': f'UMIs mapped to IGH', 'count': IGH, diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index ebcff1f6..ce496264 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -11,17 +11,21 @@ from celescope.tools.Step import Step, s_common @utils.add_log -def annotation_barcodes(match_dir, type): +def get_barcodes(match_dir, type): + """ + get reversed barcodes + VDJ barcodes and RNA barcodes are complementary and reversed + """ - cluster_data = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv') - cluster_data = cluster_data[0] - cluster_type = pd.read_csv(cluster_data, sep='\t') + clusterFile = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/*_auto_cluster_type.tsv') + clusterFile = clusterFile[0] + cluster_data = pd.read_csv(clusterFile, sep='\t') # filter barcodes if type == 'TCR': - clusters = list(cluster_type[cluster_type['cell_type'] == 'T cells']['cluster']) + clusters = cluster_data[cluster_data['cell_type'] == 'T cells']['cluster'].tolist() elif type == 'BCR': - clusters = list(cluster_type[cluster_type['cell_type'] == 'B cells']['cluster']) + clusters = cluster_data[cluster_data['cell_type'] == 'B cells']['cluster'].tolist() tsne = glob.glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv') tsne = tsne[0] @@ -32,11 +36,11 @@ def annotation_barcodes(match_dir, type): tmp = tsne_coord[tsne_coord['cluster'] == cluster].index.tolist() barcodes += tmp # write barcodes - barcodes_path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') - barcodes_path = barcodes_path[0] + path = glob.glob(f'{match_dir}/06.analysis/*_auto_assign/') + path = path[0] res = [] - with open(f'{barcodes_path}/reversed_barcodes.tsv', 'w') as fh: + with open(f'{path}/reversed_barcodes.tsv', 'w') as fh: for barcode in barcodes: barcode = Seq(barcode) barcode_reversed = barcode.reverse_complement() @@ -48,52 +52,52 @@ def annotation_barcodes(match_dir, type): @utils.add_log -def get_fastq_to_assemble(fq_outdir, fq, barcodes): +def get_fqs(fq_outdir, fq, barcodes): """ split_fastq + split clean fq from cutadapt by procided barcodes + -Input: + fq_outdir, splited fq file out dir. + fq, clean fq file. + barcodes, reversed barcodes from RNA data. + -Output: + 'umi_count.tsv', 4 cols, Barcode, readcount, UMI, mark. + 'fastq' dir, contains fqs. 
""" if not os.path.exists(fq_outdir): os.makedirs(fq_outdir) - barcode_reads_dict = defaultdict(list) # all barcodes from BCR vdj_dir paired with reads - # umi_count = defaultdict(list) - reads_count_dict = {} # all barcodes and reads num for each barcode - umi_count_dict = defaultdict(list) - umi_count = {} + barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode + reads_count_dict = {} # reads count for each barcode + + umi_dict = defaultdict(list) # umi list for each barcode + umi_count = {} # umi count for each barcode with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.name.split('_') barcode = attr[0] umi = attr[1] - if barcode in barcodes: - barcode_reads_dict[barcode].append(entry) - if umi_count_dict[barcode].count(umi) == 0: - umi_count_dict[barcode].append(umi) - for barcode in barcodes: + barcode_reads_dict[barcode].append(entry) + if umi_dict[barcode].count(umi) == 0: + umi_dict[barcode].append(umi) + + for barcode in list(umi_dict.keys()): reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - - for barcode in list(umi_count_dict.keys()): - umi_count[barcode] = len(umi_count_dict[barcode]) + umi_count[barcode] = len(umi_dict[barcode]) df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) - df_umi = df_umi.sort_values(by='UMI', ascending=False) df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) - CB = reads_count['Barcode'].tolist() - - df_umi['mark'] = df_umi["Barcode"].apply( - lambda x: "CB" if (x in CB) else "UB") - - df_umi.to_csv(f'{fq_outdir}/../umi_count.tsv', sep='\t') - df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') df_f = df_f.set_index('Barcode') + df_f = df_f.sort_values(by='UMI', ascending=False) + i = 1 for barcode in barcodes: @@ -105,17 +109,18 @@ def get_fastq_to_assemble(fq_outdir, fq, barcodes): f.write(str(entry) + '\n') if i % 1000 == 0: - get_fastq_to_assemble.logger.info(f'processed {i} cells') + get_fqs.logger.info(f'processed {i} cells') if i == len(barcodes): - get_fastq_to_assemble.logger.info(f'finally get {i} cells') + get_fqs.logger.info(f'finally get {i} cells') i += 1 + + df_f['cell_name'].fillna(0, inplace=True) df_f = df_f.astype(int) - df_f.to_csv(f'{fq_outdir}/../reads_count.tsv', sep='\t') - - + df_f.to_csv(f'{fq_outdir}/../count.txt', sep='\t') + def split_fastq(args): type = args.type @@ -126,9 +131,9 @@ def split_fastq(args): fq = args.fq fq_outdir = f'{outdir}/fastq' - barcodes = annotation_barcodes(match_dir, type) + barcodes = get_barcodes(match_dir, type) - get_fastq_to_assemble(fq_outdir, fq, barcodes) + get_fqs(fq_outdir, fq, barcodes) def get_opts_split_fastq(parser, sub_program): diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 759b0e02..29c08cf3 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -106,77 +106,94 @@ def vdj_sum(args): vdj_sum_summary = [] - count_umi = f'{fastq_dir}/../umi_count.tsv' + count_umi_file = f'{fastq_dir}/../count.txt' + + count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + + all_cells = count_umi.shape[0] if type == 'TCR': - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) - - count_a = filtered[filtered['locus'] == 'A'].shape[0] - count_b = filtered[filtered['locus'] == 'B'].shape[0] - paired_cell = 
pd.DataFrame(filtered['cell_name'].value_counts()) - productive_cells = paired_cell.shape[0] - unpaired_cell = paired_cell[paired_cell['cell_name'] == 1] - paired_cell = paired_cell[paired_cell['cell_name'] == 2] - paired_cell = list(paired_cell.index) - - aaseqs = [] - for cell in paired_cell: - temp = filtered[filtered['cell_name'] == cell] - temp_loci = list(temp['locus']) - temp_aaseq = list(temp['CDR3aa']) - string = 'TR{}:C{}F;TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0], temp_loci[1], temp_aaseq[1]) - aaseqs.append(string) - - for cell in list(unpaired_cell.index): - temp = filtered[filtered['cell_name'] == cell] - temp_loci = list(temp['locus']) - temp_aaseq = list(temp['CDR3aa']) - string = 'TR{}:C{}F'.format(temp_loci[0], temp_aaseq[0]) - aaseqs.append(string) - - per_count_data = pd.DataFrame() - per_count_data['cdr3s_aa'] = aaseqs - clonetypes = pd.DataFrame(per_count_data['cdr3s_aa'].value_counts()) - clonetypes.columns = ["Frequency"] - Percent = [] + productive_cells = set(filtered['cell_name'].tolist()) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + productive_cells_num = len(productive_cells) + + TRA_chain = filtered[filtered['locus'] == 'A'] + TRA_chain_num = TRA_chain.shape[0] + TRB_chain = filtered[filtered['locus'] == 'B'] + TRB_chain_num = TRB_chain.shape[0] + + TRAs, TRBs = [], [] + paired_cell = 0 + for cell in productive_cells: + tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] + if tmp1.empty is not True: + chainA = tmp1['CDR3aa'].tolist()[0] + TRAs.append(chainA) + else: + TRAs.append('NaN') + + tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] + if tmp2.empty is not True: + chainB = tmp2['CDR3aa'].tolist()[0] + TRBs.append(chainB) + else: + TRBs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_cell += 1 + + clonetypes_table = pd.DataFrame() + clonetypes_table['TRA_chain'] = TRAs + clonetypes_table['TRB_chain'] = TRBs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) + sum = clonetypes['Frequency'].sum() + proportions = [] for f in list(clonetypes['Frequency']): p = f/sum - Percent.append(p) - clonetypes['Percent'] = Percent + p = round(p, 4) + p = str(p * 100) + '%' + proportions.append(p) + clonetypes['Proportion'] = proportions + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': matched_bcs, - 'total_count': matched_bcs, - }) + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.txt', sep='\t') vdj_sum_summary.append({ - 'item': 'Productive cells', - 'count': productive_cells, - 'total_count': matched_bcs + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ 'item': 'Cells with TRA', - 'count': count_a, - 'total_count': matched_bcs, + 'count': TRA_chain_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ 'item': 'Cells with TRB', - 'count': count_b, - 
'total_count': matched_bcs, + 'count': TRB_chain_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ 'item': 'Cells with paired TRA and TRB', - 'count': len(paired_cell), - 'total_count': matched_bcs, + 'count': paired_cell, + 'total_count': all_cells, }) with open(f'{ass_dir}/tmp.txt', 'r') as f: @@ -206,7 +223,15 @@ def vdj_sum(args): elif type == 'BCR': - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi)) + productive_cells = set(filtered['CELL'].tolist()) + + productive_cells_num = len(productive_cells) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) filtered_h = filtered[filtered['LOCUS'] == 'H'] filtered_k = filtered[filtered['LOCUS'] == 'K'] @@ -215,101 +240,102 @@ def vdj_sum(args): filtered_k_count = filtered_k.shape[0] filtered_l_count = filtered_l.shape[0] - paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) - productive_cells = paired_cell.shape[0] + IGHs, IGKs, IGLs = [], [], [] - paired_cell = pd.DataFrame(filtered['CELL'].value_counts()) - productive_cells = paired_cell.shape[0] - unpaired_cell = paired_cell[paired_cell['CELL'] == 1] - paired_cell = paired_cell[paired_cell['CELL'] == 2] - paired_k = 0 - paired_l = 0 + paired_k, paired_l = 0, 0 - clones = pd.DataFrame() - cells = list(paired_cell.index) - aaseqs = [] + for cell in productive_cells: + tmp1 = filtered_h[filtered_h['CELL'] == cell] + if tmp1.empty is not True: + seq = tmp1['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGHs.append(aaseq) + else: + IGHs.append('NaN') + + tmp2 = filtered_l[filtered_l['CELL'] == cell] + if tmp2.empty is not True: + seq = tmp2['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGLs.append(aaseq) + else: + IGLs.append('NaN') + + tmp3 = filtered_k[filtered_k['CELL'] == cell] + if tmp3.empty is not True: + seq = tmp3['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGKs.append(aaseq) + else: + IGKs.append('NaN') - for cell in cells: - if 'K' in list(filtered[filtered['CELL'] == cell]['LOCUS']): - paired_k += 1 - elif 'L' in list(filtered[filtered['CELL'] == cell]['LOCUS']): + if not tmp1.empty and not tmp2.empty: paired_l += 1 - tep = filtered[filtered['CELL'] == cell] - tep_loci = list(tep['LOCUS']) - cdr3 = list(tep['JUNCTION']) - aaseq = [] - for seq in cdr3: - seq = Seq(seq) - seq = seq.translate() - aaseq.append(seq) - string = 'IG{}:{};IG{}:{}'.format(tep_loci[0], aaseq[0], tep_loci[1], aaseq[1]) - aaseqs.append(string) - - for cell in list(unpaired_cell.index): - cells.append(cell) - locus = list(filtered[filtered['CELL'] == cell]['LOCUS']) - cdr3 = list(filtered[filtered['CELL'] == cell]['JUNCTION']) - seq = Seq(cdr3[0]) - seq = seq.translate() - string = 'IG{}:{}'.format(locus[0], seq) - aaseqs.append(string) - - clones['CELLS'] = cells - - clones["cdr3s_aa"] = aaseqs - clonetypes = pd.DataFrame(clones['cdr3s_aa'].value_counts()) - clonetypes.columns = ["Frequency"] - Percent = [] + if not tmp1.empty and not tmp3.empty: + paired_k += 1 + + clonetypes_table = pd.DataFrame() + + clonetypes_table['IGH_chain'] = IGHs + clonetypes_table['IGL_chain'] = IGLs + clonetypes_table['IGK_chain'] = IGKs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + + Proportion = [] sum = clonetypes['Frequency'].sum() for f 
in list(clonetypes['Frequency']): p = f/sum - Percent.append(p) - clonetypes['Percent'] = Percent + p = round(p, 4) + p = str(p*100) + '%' + Proportion.append(p) + clonetypes['Proportion'] = Proportion + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes.rename(columns={'index': 'cdr3s_aa'}, inplace=True) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - vdj_sum_summary.append({ - 'item': 'Matched cells', - 'count': matched_bcs, - 'total_count': matched_bcs - }) vdj_sum_summary.append({ - 'item': 'Productive cells', - 'count': productive_cells, - 'total_count': matched_bcs + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGH', 'count': filtered_h_count, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGK', 'count': filtered_k_count, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGL', 'count': filtered_l_count, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGH and IGK', 'count': paired_k, - 'total_count': matched_bcs + 'total_count': all_cells }) vdj_sum_summary.append({ 'item': 'Cells with IGH and IGL', 'count': paired_l, - 'total_count': matched_bcs + 'total_count': all_cells }) with open(f'{ass_dir}/tmp.txt', 'r') as f: @@ -377,7 +403,6 @@ def vdj_sum(args): # clonetype table - clonetypes['Percent'] = clonetypes['Percent'].apply(lambda x: str(round(x*100, 2)) + '%') title = 'Clonetypes' table_dict = step.get_table(title, 'clonetypes_table', clonetypes) -- Gitee From 3a41427f093a351da8c06b85c06621fd975ab694 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 15:03:57 +0800 Subject: [PATCH 27/96] class split_fastq --- celescope/tracer_vdj/split_fastq.py | 157 ++++++++++++++++------------ 1 file changed, 89 insertions(+), 68 deletions(-) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index ce496264..ac11ce47 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -10,7 +10,6 @@ from celescope.tools import utils from celescope.tools.Step import Step, s_common -@utils.add_log def get_barcodes(match_dir, type): """ get reversed barcodes @@ -51,97 +50,119 @@ def get_barcodes(match_dir, type): return res -@utils.add_log -def get_fqs(fq_outdir, fq, barcodes): +class Split_fastq(Step): """ - split_fastq - split clean fq from cutadapt by procided barcodes - -Input: - fq_outdir, splited fq file out dir. - fq, clean fq file. - barcodes, reversed barcodes from RNA data. - -Output: - 'umi_count.tsv', 4 cols, Barcode, readcount, UMI, mark. - 'fastq' dir, contains fqs. + Features + + - Get reversed barcodes from RNA annotation results. + - Split clean R2 fastq file and count reads and UMIs for each barcode. + + Output + + - `03.split_fastq/count.txt`, 4 columns, barcodes, reads count, UMIs count, mark. + - `03.split_fastq/fastq`, fastq file directory for each barcode in reversed barcodes. 
""" - if not os.path.exists(fq_outdir): - os.makedirs(fq_outdir) - - barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode - reads_count_dict = {} # reads count for each barcode - - umi_dict = defaultdict(list) # umi list for each barcode - umi_count = {} # umi count for each barcode - - with pysam.FastxFile(fq) as fq: - for entry in fq: - attr = entry.name.split('_') - barcode = attr[0] - umi = attr[1] - barcode_reads_dict[barcode].append(entry) - if umi_dict[barcode].count(umi) == 0: - umi_dict[barcode].append(umi) - - for barcode in list(umi_dict.keys()): - reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) - umi_count[barcode] = len(umi_dict[barcode]) - df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) - df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.type = args.type + self.fq = args.fq + self.match_dir = args.match_dir + self.fq_outdir = f'{self.outdir}/fastq' + + # out file name + self.count_file = f'{self.outdir}/{self.sample}_count.txt' + + @utils.add_log + def get_fqs(self): + """ + split_fastq + split clean fq from cutadapt by procided barcodes + -Input: + fq_outdir, splited fq file out dir. + fq, clean fq file. + barcodes, reversed barcodes from RNA data. + -Output: + 'umi_count.tsv', 4 cols, Barcode, readcount, UMI, mark. + 'fastq' dir, contains fqs. + """ + if not os.path.exists(self.fq_outdir): + os.makedirs(self.fq_outdir) + + barcodes = get_barcodes(self.match_dir, self.type) + + barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode + reads_count_dict = {} # reads count for each barcode + + umi_dict = defaultdict(list) # umi list for each barcode + umi_count = {} # umi count for each barcode + + with pysam.FastxFile(self.fq) as fq: + for entry in fq: + attr = entry.name.split('_') + barcode = attr[0] + umi = attr[1] + barcode_reads_dict[barcode].append(entry) + if umi_dict[barcode].count(umi) == 0: + umi_dict[barcode].append(umi) + + for barcode in list(umi_dict.keys()): + reads_count_dict[barcode] = len(barcode_reads_dict[barcode]) + umi_count[barcode] = len(umi_dict[barcode]) - reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) - reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) + df_umi = pd.DataFrame.from_dict(umi_count, orient='index',columns=['UMI']) + df_umi = df_umi.reset_index().rename(columns={'index': 'Barcode'}) - df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') + reads_count = pd.DataFrame.from_dict(reads_count_dict, orient='index',columns=['readcount']) + reads_count = reads_count.reset_index().rename(columns={'index': 'Barcode'}) - df_f = df_f.set_index('Barcode') + df_f = pd.merge(reads_count, df_umi, on='Barcode', how='inner') - df_f = df_f.sort_values(by='UMI', ascending=False) + df_f = df_f.set_index('Barcode') - i = 1 + df_f = df_f.sort_values(by='UMI', ascending=False) - for barcode in barcodes: + i = 1 - df_f.loc[barcode, 'cell_name'] = i + for barcode in barcodes: + + df_f.loc[barcode, 'cell_name'] = i - with open(f'{fq_outdir}/{i}.fq', 'w') as f: - for entry in barcode_reads_dict[barcode]: - f.write(str(entry) + '\n') + with open(f'{self.fq_outdir}/{i}.fq', 'w') as f: + for entry in barcode_reads_dict[barcode]: + f.write(str(entry) + '\n') - if i % 1000 == 0: - get_fqs.logger.info(f'processed {i} cells') + if i % 1000 == 0: + Split_fastq.get_fqs.logger.info(f'processed {i} cells') 
- if i == len(barcodes): - get_fqs.logger.info(f'finally get {i} cells') + if i == len(barcodes): + Split_fastq.get_fqs.logger.info(f'finally get {i} cells') - i += 1 + i += 1 + + df_f['cell_name'].fillna(0, inplace=True) - df_f['cell_name'].fillna(0, inplace=True) - - df_f = df_f.astype(int) - df_f.to_csv(f'{fq_outdir}/../count.txt', sep='\t') - + df_f = df_f.astype(int) + df_f.to_csv(self.count_file, sep='\t') + self.clean_up() + + +@utils.add_log def split_fastq(args): - type = args.type - match_dir = args.match_dir - sample = args.sample - outdir = args.outdir - assay = args.assay - fq = args.fq - - fq_outdir = f'{outdir}/fastq' - barcodes = get_barcodes(match_dir, type) - - get_fqs(fq_outdir, fq, barcodes) + step_name = 'split_fastq' + split_fastq_obj = Split_fastq(args, step_name) + split_fastq_obj.get_fqs() -def get_opts_split_fastq(parser, sub_program): +def get_opts_split_fastq(parser, sub_program=True): if sub_program: parser = s_common(parser) parser.add_argument('--fq', required=True) parser.add_argument('--match_dir', help='matched rna_dir') parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + -- Gitee From cc5dcd5606b5c377a6a4f8c5e368c5cd06cb4a1d Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 15:56:35 +0800 Subject: [PATCH 28/96] add class Go_assemble --- celescope/tracer_vdj/go_assemble.py | 315 +++++++++++++++------------- 1 file changed, 165 insertions(+), 150 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index c456bae1..9a025c40 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -19,6 +19,16 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' +def get_umi_count(fq): + umis = [] + with pysam.FastxFile(fq) as fh: + for entry in fh: + attr = entry.name.split('_') + barcode = attr[0] + umi = attr[1] + umis.append(umi) + res = len(set(umis)) + return res # 开始组装 @@ -86,37 +96,55 @@ def tracer(fq, outdir, species): os.system(cmd) -@utils.add_log -def run_tracer(outdir, fastq_dir, species, thread): +class Go_assemble(Step): + """ + Features - fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] - outdirs = [outdir] * len(fqs) - species = [species] * len(fqs) - if not os.path.exists(f'{outdir}/tracer'): - os.makedirs(f'{outdir}/tracer') + - Assemble TCR/BCR full length by tracer. + - Summary mapping rate. - all_res = [] - with ProcessPoolExecutor(thread) as pool: - for res in pool.map(tracer, fqs, outdirs, species): - all_res.append(res) + Output - tracer_summarise(outdir) + - `04.go_assemble/tracer` or `04.go_assemble/bracer` Tracer output directory. + - `04.go_assemble/stat.txt` Recording mapping rate. 
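+
+    Example (an illustrative sketch only; `parser` is assumed to be built with get_opts_go_assemble)
+
+        args = parser.parse_args()  # supplies --fastq_dir, --species, --type, --thread
+        Go_assemble(args, 'go_assemble').run()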
+ """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + self.species = args.species + self.type = args.type + self.thread = int(args.thread) + self.fastq_dir = args.fastq_dir -@utils.add_log -def run_bracer(outdir, fastq_dir, species, thread): - fqs = [join(fastq_dir, f) for f in listdir(fastq_dir) if isfile(join(fastq_dir, f))] - outdirs = [outdir] * len(fqs) - species = [species] * len(fqs) - if not os.path.exists(f'{outdir}/bracer'): - os.makedirs(f'{outdir}/bracer') + def run_tracer(self): + + fqs = [join(self.fastq_dir, f) for f in listdir(self.fastq_dir) if isfile(join(self.fastq_dir, f))] + outdirs = [self.outdir] * len(fqs) + species = [self.species] * len(fqs) + if not os.path.exists(f'{self.outdir}/tracer'): + os.makedirs(f'{self.outdir}/tracer') + + all_res = [] + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(tracer, fqs, outdirs, species): + all_res.append(res) - all_res = [] - with ProcessPoolExecutor(thread) as pool: - for res in pool.map(bracer, fqs, outdirs, species): - all_res.append(res) + tracer_summarise(self.outdir) - bracer_summarise(outdir) + + def run_bracer(self): + fqs = [join(self.fastq_dir, f) for f in listdir(self.fastq_dir) if isfile(join(self.fastq_dir, f))] + outdirs = [self.outdir] * len(fqs) + species = [self.species] * len(fqs) + if not os.path.exists(f'{self.outdir}/bracer'): + os.makedirs(f'{self.outdir}/bracer') + + all_res = [] + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(bracer, fqs, outdirs, species): + all_res.append(res) + + bracer_summarise(self.outdir) ################def get_reads_count(fq): @@ -126,137 +154,124 @@ def run_bracer(outdir, fastq_dir, species, thread): # count += 1 # return count - -def get_umi_count(fq): - umis = [] - with pysam.FastxFile(fq) as fh: - for entry in fh: - attr = entry.name.split('_') - barcode = attr[0] - umi = attr[1] - umis.append(umi) - res = len(set(umis)) - return res - - -def go_assemble_summary(outdir, type): - - total_fq = f'{outdir}/../03.split_fastq/reads_count.tsv' - UMIs = pd.read_csv(total_fq, sep='\t') - - all_UMIs = UMIs['UMIs_count'].tolist() - medians = int(np.median(all_UMIs)) - all_UMIs = sum(all_UMIs) - - stat_file = outdir + '/../04.go_assemble/stat.txt' - - go_assemble_summary = [] - - if type == 'TCR': - TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - TRA_UMIs_count = sum(TRA_UMIs) - medianA = int(np.median(TRA_UMIs)) - TRB_UMIs_count = sum(TRB_UMIs) - medianB = int(np.median(TRB_UMIs)) - - totals = TRA_UMIs_count + TRB_UMIs_count - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA or TRB', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRA', - 'count': TRA_UMIs_count, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRB', - 'count': TRB_UMIs_count, - 'total_count': all_UMIs, - }) - - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Madian UMIs per cell:{medians}\n') - f.write(f'Median TRA UMIs per cell:{medianA}\n') - f.write(f'Median TRB UMIs per cell:{medianB}\n') - - elif type == 'BCR': - IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs 
= [get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - IGH = sum(IGH_UMIs) - medianH = np.median(IGH_UMIs) - IGK = sum(IGK_UMIs) - medianK = np.median(IGK_UMIs) - IGL = sum(IGL_UMIs) - medianL = np.median(IGL_UMIs) - - totals = IGH + IGK + IGL - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL or IGK', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGH', - 'count': IGH, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGK', - 'count': IGK, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGL', - 'count': IGL, - 'total_count': all_UMIs, - }) - - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medians}\n') - f.write(f'Median IGH UMIs per Cell:{medianH}\n') - f.write(f'Median IGK UMIs per Cell:{medianK}\n') - f.write(f'Median IGL UMIs per Cell:{medianL}\n') - - df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) - - utils.gen_stat(df, stat_file) + def go_assemble_summary(self): + + count_file = f'{self.outdir}/../03.split_fastq/{self.sample}_count.txt' + UMIs = pd.read_csv(count_file, sep='\t') + + all_UMIs = UMIs['UMIs_count'].tolist() + medians = int(np.median(all_UMIs)) + all_UMIs = sum(all_UMIs) + + stat_file = self.outdir + '/stat.txt' + + go_assemble_summary = [] + + if type == 'TCR': + TRAs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] + TRA_UMIs_count = sum(TRA_UMIs) + medianA = int(np.median(TRA_UMIs)) + TRB_UMIs_count = sum(TRB_UMIs) + medianB = int(np.median(TRB_UMIs)) + + totals = TRA_UMIs_count + TRB_UMIs_count + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to TRA or TRB', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRA', + 'count': TRA_UMIs_count, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRB', + 'count': TRB_UMIs_count, + 'total_count': all_UMIs, + }) + + with open(f'{self.outdir}/tmp.txt', 'w') as f: + f.write(f'Madian UMIs per cell:{medians}\n') + f.write(f'Median TRA UMIs per cell:{medianA}\n') + f.write(f'Median TRB UMIs per cell:{medianB}\n') + + elif type == 'BCR': + IGHs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') + + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + + IGH = sum(IGH_UMIs) + medianH = np.median(IGH_UMIs) + IGK = sum(IGK_UMIs) + medianK = np.median(IGK_UMIs) + IGL = sum(IGL_UMIs) + medianL = np.median(IGL_UMIs) + + totals = IGH + IGK + IGL + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGH', + 'count': IGH, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGK', + 'count': IGK, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGL', + 'count': IGL, + 
'total_count': all_UMIs, + }) + + with open(f'{self.outdir}/tmp.txt', 'w') as f: + f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median IGH UMIs per Cell:{medianH}\n') + f.write(f'Median IGK UMIs per Cell:{medianK}\n') + f.write(f'Median IGL UMIs per Cell:{medianL}\n') + + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + + self.clean_up() + + + @utils.add_log + def run(self): + if self.type == 'TCR': + self.run_tracer() + elif self.type == 'BCR': + self.run_bracer() + self.go_assemble_summary() +@utils.add_log def go_assemble(args): step_name = 'go_assemble' - step = Step(args, step_name) - thread = int(args.thread) - fastq_dir = args.fastq_dir - outdir = args.outdir - species = args.species + go_assemble_obj = Go_assemble(args, step_name) + go_assemble_obj.run() - type = args.type - if type == 'TCR': - run_tracer(outdir, fastq_dir, species, thread) - elif type == 'BCR': - run_bracer(outdir, fastq_dir, species, thread) - - go_assemble_summary(outdir, type) - - step.clean_up() def get_opts_go_assemble(parser, sub_program): if sub_program: -- Gitee From 78b942b8cdd9354035938340f606bbd271b4fdfe Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 17:04:46 +0800 Subject: [PATCH 29/96] add Go_assemble class --- celescope/tracer_vdj/go_assemble.py | 247 +++++++++++++++------------- 1 file changed, 135 insertions(+), 112 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 9a025c40..4d0fe95a 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -19,6 +19,23 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' + +def gen_stat(summary, stat_file): + stat = summary + stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] + stat = stat.loc[:, ["item", "new_count"]] + stat.to_csv(stat_file, sep=":", header=None, index=False) + + +def percent_str_func(row): + need_percent = bool( + re.search("Cells with", row["item"], flags=re.IGNORECASE)) + if need_percent: + return "(" + str(row["percent"]) + "%)" + else: + return "" + + def get_umi_count(fq): umis = [] with pysam.FastxFile(fq) as fh: @@ -29,7 +46,121 @@ def get_umi_count(fq): umis.append(umi) res = len(set(umis)) return res -# 开始组装 + + +def assemble_summary(outdir, sample, type): + + count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' + UMIs = pd.read_csv(count_file, sep='\t') + + all_ = UMIs['UMI'].tolist() + medians = int(np.median(all_)) + all_UMIs = sum(all_) + + stat_file = outdir + '/stat.txt' + + go_assemble_summary = [] + + if type == 'TCR': + TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] + TRA_UMIs_count = sum(TRA_UMIs) + medianA = int(np.median(TRA_UMIs)) + TRB_UMIs_count = sum(TRB_UMIs) + medianB = int(np.median(TRB_UMIs)) + + totals = TRA_UMIs_count + TRB_UMIs_count + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to TRA or TRB', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRA', + 'count': TRA_UMIs_count, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to TRB', + 'count': TRB_UMIs_count, + 'total_count': all_UMIs, + }) + + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Madian UMIs per 
cell:{medians}\n') + f.write(f'Median TRA UMIs per cell:{medianA}\n') + f.write(f'Median TRB UMIs per cell:{medianB}\n') + + elif type == 'BCR': + IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') + + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + + IGH = sum(IGH_UMIs) + medianH = np.median(IGH_UMIs) + IGK = sum(IGK_UMIs) + medianK = np.median(IGK_UMIs) + IGL = sum(IGL_UMIs) + medianL = np.median(IGL_UMIs) + + totals = IGH + IGK + IGL + + go_assemble_summary.append({ + 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'count': totals, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGH', + 'count': IGH, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGK', + 'count': IGK, + 'total_count': all_UMIs, + }) + + go_assemble_summary.append({ + 'item': f'UMIs mapped to IGL', + 'count': IGL, + 'total_count': all_UMIs, + }) + + with open(f'{outdir}/tmp.txt', 'w') as f: + f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median IGH UMIs per Cell:{medianH}\n') + f.write(f'Median IGK UMIs per Cell:{medianK}\n') + f.write(f'Median IGL UMIs per Cell:{medianL}\n') + + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) + + df['count'] = df['count'].apply(int) + + df['percent'] = df['count']/(df.total_count.astype('float')) * 100 + + df['percent'] = df['percent'].apply( + lambda x: round(x, 2) + ) + df['count'] = df['count'].apply(utils.format_number) + + df['percent_str'] = df.apply( + lambda row: percent_str_func(row), axis=1 + ) + + gen_stat(df, stat_file) + def bracer_summarise(outdir): @@ -131,6 +262,8 @@ class Go_assemble(Step): tracer_summarise(self.outdir) + assemble_summary(self.outdir, self.sample, self.type) + def run_bracer(self): fqs = [join(self.fastq_dir, f) for f in listdir(self.fastq_dir) if isfile(join(self.fastq_dir, f))] @@ -146,116 +279,7 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - -################def get_reads_count(fq): -# with pysam.FastxFile(fq) as fh: -# count = 0 -# for entry in fh: -# count += 1 -# return count - - def go_assemble_summary(self): - - count_file = f'{self.outdir}/../03.split_fastq/{self.sample}_count.txt' - UMIs = pd.read_csv(count_file, sep='\t') - - all_UMIs = UMIs['UMIs_count'].tolist() - medians = int(np.median(all_UMIs)) - all_UMIs = sum(all_UMIs) - - stat_file = self.outdir + '/stat.txt' - - go_assemble_summary = [] - - if type == 'TCR': - TRAs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{self.outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - TRA_UMIs_count = sum(TRA_UMIs) - medianA = int(np.median(TRA_UMIs)) - TRB_UMIs_count = sum(TRB_UMIs) - medianB = int(np.median(TRB_UMIs)) - - totals = TRA_UMIs_count + TRB_UMIs_count - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA or TRB', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRA', - 'count': TRA_UMIs_count, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to TRB', - 'count': TRB_UMIs_count, - 'total_count': all_UMIs, - }) - - with open(f'{self.outdir}/tmp.txt', 'w') 
as f: - f.write(f'Madian UMIs per cell:{medians}\n') - f.write(f'Median TRA UMIs per cell:{medianA}\n') - f.write(f'Median TRB UMIs per cell:{medianB}\n') - - elif type == 'BCR': - IGHs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{self.outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs = [get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - IGH = sum(IGH_UMIs) - medianH = np.median(IGH_UMIs) - IGK = sum(IGK_UMIs) - medianK = np.median(IGK_UMIs) - IGL = sum(IGL_UMIs) - medianL = np.median(IGL_UMIs) - - totals = IGH + IGK + IGL - - go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL or IGK', - 'count': totals, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGH', - 'count': IGH, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGK', - 'count': IGK, - 'total_count': all_UMIs, - }) - - go_assemble_summary.append({ - 'item': f'UMIs mapped to IGL', - 'count': IGL, - 'total_count': all_UMIs, - }) - - with open(f'{self.outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medians}\n') - f.write(f'Median IGH UMIs per Cell:{medianH}\n') - f.write(f'Median IGK UMIs per Cell:{medianK}\n') - f.write(f'Median IGL UMIs per Cell:{medianL}\n') - - df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) - - utils.gen_stat(df, stat_file) - - self.clean_up() - + assemble_summary(self.outdir, self.sample, self.type) @utils.add_log def run(self): @@ -263,7 +287,6 @@ class Go_assemble(Step): self.run_tracer() elif self.type == 'BCR': self.run_bracer() - self.go_assemble_summary() @utils.add_log -- Gitee From 7f1fc4e2f9aa71aeab08cd74e6436248c6b0ed8e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 7 Jun 2021 17:54:57 +0800 Subject: [PATCH 30/96] add class vdj_sum and unify html text --- .../html/tracer_vdj/go_assemble_summary.html | 14 +- .../html/tracer_vdj/vdj_sum_summary.html | 30 +- celescope/tracer_vdj/go_assemble.py | 22 +- celescope/tracer_vdj/vdj_sum.py | 565 +++++++++--------- 4 files changed, 316 insertions(+), 315 deletions(-) diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html index 768043e2..5bb8c0bf 100644 --- a/celescope/templates/html/tracer_vdj/go_assemble_summary.html +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -3,14 +3,14 @@
{% for item in go_assemble_summary %} diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html index f8881c7d..7b5fdb08 100644 --- a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -2,17 +2,25 @@

Cell

{% for item in vdj_sum_summary %} diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 4d0fe95a..264febb0 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -74,7 +74,7 @@ def assemble_summary(outdir, sample, type): totals = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA or TRB', + 'item': f'All UMIs mapped to TRA and TRB', 'count': totals, 'total_count': all_UMIs, }) @@ -115,7 +115,7 @@ def assemble_summary(outdir, sample, type): totals = IGH + IGK + IGL go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL or IGK', + 'item': f'All UMIs mapped to IGH, IGL and IGK', 'count': totals, 'total_count': all_UMIs, }) @@ -146,21 +146,7 @@ def assemble_summary(outdir, sample, type): df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) - df['count'] = df['count'].apply(int) - - df['percent'] = df['count']/(df.total_count.astype('float')) * 100 - - df['percent'] = df['percent'].apply( - lambda x: round(x, 2) - ) - df['count'] = df['count'].apply(utils.format_number) - - df['percent_str'] = df.apply( - lambda row: percent_str_func(row), axis=1 - ) - - gen_stat(df, stat_file) - + utils.gen_stat(df, stat_file) def bracer_summarise(outdir): @@ -288,6 +274,8 @@ class Go_assemble(Step): elif self.type == 'BCR': self.run_bracer() + self.clean_up() + @utils.add_log def go_assemble(args): diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 29c08cf3..62492d27 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -13,6 +13,7 @@ from celescope.tools.Step import Step, s_common import glob from celescope.tools.cellranger3 import get_plot_elements import json +from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): @@ -82,333 +83,337 @@ def filtering(type, ass_dir, outdir): filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') return filtered + +class Vdj_sum(Step): + """ + Features -@utils.add_log -def vdj_sum(args): + - Filter tracer results by TPM. + - Calculate clonetypes. - step_name = f"vdj_sum" - step = Step(args, step_name) + Output - type = args.type - ass_dir = args.ass_dir - sample = args.sample - outdir = args.outdir - fastq_dir = args.fastq_dir - UMI_min = args.UMI_min + - `05.vdj_sum/filtered.txt` Filtered results of tracer. Each cell has unique chain for each locus. + - `05.vdj_sum/clonetypes.txt` Clonetypes calculation. 5 (TCR) or 6 (BCR) columns, clonetypeId, (detailed clonetypes), frequency, proportion. 
+ """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + self.type = args.type + self.fastq_dir = args.fastq_dir + self.ass_dir = args.ass_dir - filtered = filtering(type, ass_dir, outdir) - fqs = glob.glob(f'{fastq_dir}/*.fq') - matched_bcs = len(fqs) + @utils.add_log + def run(self): + ass_dir = self.ass_dir + outdir = self.outdir + fastq_dir = self.fastq_dir + type = self.type - stat_file = outdir + '/stat.txt' + results = filtering(type, ass_dir, outdir) - vdj_sum_summary = [] - - count_umi_file = f'{fastq_dir}/../count.txt' + stat_file = outdir + '/stat.txt' - count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) - - all_cells = count_umi.shape[0] + vdj_sum_summary = [] + + count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' - if type == 'TCR': + count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + + all_cells = count_umi.shape[0] + + if type == 'TCR': + + productive_cells = set(results['cell_name'].tolist()) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') - productive_cells = set(filtered['cell_name'].tolist()) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - productive_cells_num = len(productive_cells) - - TRA_chain = filtered[filtered['locus'] == 'A'] - TRA_chain_num = TRA_chain.shape[0] - TRB_chain = filtered[filtered['locus'] == 'B'] - TRB_chain_num = TRB_chain.shape[0] - - TRAs, TRBs = [], [] - paired_cell = 0 - for cell in productive_cells: - tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] - if tmp1.empty is not True: - chainA = tmp1['CDR3aa'].tolist()[0] - TRAs.append(chainA) - else: - TRAs.append('NaN') - - tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] - if tmp2.empty is not True: - chainB = tmp2['CDR3aa'].tolist()[0] - TRBs.append(chainB) - else: - TRBs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_cell += 1 - - clonetypes_table = pd.DataFrame() - clonetypes_table['TRA_chain'] = TRAs - clonetypes_table['TRB_chain'] = TRBs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - - sum = clonetypes['Frequency'].sum() - proportions = [] - for f in list(clonetypes['Frequency']): - p = f/sum - p = round(p, 4) - p = str(p * 100) + '%' - proportions.append(p) - clonetypes['Proportion'] = proportions - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) - - clonetypes.to_csv(f'{outdir}/clonetypes.txt', sep='\t') - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRA', - 'count': TRA_chain_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRB', - 'count': TRB_chain_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired TRA and TRB', - 'count': paired_cell, - 'total_count': all_cells, - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians = [] - for line in f: - line = 
line.rstrip('\n').split(':') - medians.append(int(line[1])) + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + productive_cells_num = len(productive_cells) + + TRA_chain = results[results['locus'] == 'A'] + TRA_chain_num = TRA_chain.shape[0] + TRB_chain = results[results['locus'] == 'B'] + TRB_chain_num = TRB_chain.shape[0] + + TRAs, TRBs = [], [] + paired_cell = 0 + for cell in productive_cells: + tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] + if tmp1.empty is not True: + chainA = tmp1['CDR3aa'].tolist()[0] + TRAs.append(chainA) + else: + TRAs.append('NaN') + + tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] + if tmp2.empty is not True: + chainB = tmp2['CDR3aa'].tolist()[0] + TRBs.append(chainB) + else: + TRBs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_cell += 1 + + clonetypes_table = pd.DataFrame() + clonetypes_table['TRA_chain'] = TRAs + clonetypes_table['TRB_chain'] = TRBs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) + + sum = clonetypes['Frequency'].sum() + proportions = [] + for f in list(clonetypes['Frequency']): + p = f/sum + p = round(p, 4) + p = str(p * 100) + '%' + proportions.append(p) + clonetypes['Proportion'] = proportions + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medians[1], - 'total_count': np.nan + 'item': 'Cells with TRA', + 'count': TRA_chain_num, + 'total_count': all_cells, }) vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medians[2], - 'total_count': np.nan + 'item': 'Cells with TRB', + 'count': TRB_chain_num, + 'total_count': all_cells, }) + vdj_sum_summary.append({ + 'item': 'Cells with paired TRA and TRB', + 'count': paired_cell, + 'total_count': all_cells, + }) - elif type == 'BCR': + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians = [] + for line in f: + line = line.rstrip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) + + + elif type == 'BCR': + + productive_cells = set(results['CELL'].tolist()) + + productive_cells_num = len(productive_cells) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + results_h = results[results['LOCUS'] == 'H'] + results_k = results[results['LOCUS'] == 'K'] + results_l = results[results['LOCUS'] == 'L'] + results_h_count = results_h.shape[0] + results_k_count = results_k.shape[0] + results_l_count = results_l.shape[0] + + IGHs, 
IGKs, IGLs = [], [], [] + + paired_k, paired_l = 0, 0 + + for cell in productive_cells: + tmp1 = results_h[results_h['CELL'] == cell] + if tmp1.empty is not True: + seq = tmp1['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGHs.append(aaseq) + else: + IGHs.append('NaN') + + tmp2 = results_l[results_l['CELL'] == cell] + if tmp2.empty is not True: + seq = tmp2['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGLs.append(aaseq) + else: + IGLs.append('NaN') + + tmp3 = results_k[results_k['CELL'] == cell] + if tmp3.empty is not True: + seq = tmp3['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGKs.append(aaseq) + else: + IGKs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_l += 1 + if not tmp1.empty and not tmp3.empty: + paired_k += 1 + + clonetypes_table = pd.DataFrame() + + clonetypes_table['IGH_chain'] = IGHs + clonetypes_table['IGL_chain'] = IGLs + clonetypes_table['IGK_chain'] = IGKs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + + Proportion = [] + sum = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): + p = f/sum + p = round(p, 4) + p = str(p*100) + '%' + Proportion.append(p) + clonetypes['Proportion'] = Proportion + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - productive_cells = set(filtered['CELL'].tolist()) - - productive_cells_num = len(productive_cells) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - step.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - filtered_h = filtered[filtered['LOCUS'] == 'H'] - filtered_k = filtered[filtered['LOCUS'] == 'K'] - filtered_l = filtered[filtered['LOCUS'] == 'L'] - filtered_h_count = filtered_h.shape[0] - filtered_k_count = filtered_k.shape[0] - filtered_l_count = filtered_l.shape[0] - - IGHs, IGKs, IGLs = [], [], [] - - paired_k, paired_l = 0, 0 - - for cell in productive_cells: - tmp1 = filtered_h[filtered_h['CELL'] == cell] - if tmp1.empty is not True: - seq = tmp1['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGHs.append(aaseq) - else: - IGHs.append('NaN') - - tmp2 = filtered_l[filtered_l['CELL'] == cell] - if tmp2.empty is not True: - seq = tmp2['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGLs.append(aaseq) - else: - IGLs.append('NaN') - - tmp3 = filtered_k[filtered_k['CELL'] == cell] - if tmp3.empty is not True: - seq = tmp3['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGKs.append(aaseq) - else: - IGKs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_l += 1 - if not tmp1.empty and not tmp3.empty: - paired_k += 1 - - clonetypes_table = pd.DataFrame() - - clonetypes_table['IGH_chain'] = IGHs - clonetypes_table['IGL_chain'] = IGLs - clonetypes_table['IGK_chain'] = IGKs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) - - Proportion = [] - sum = clonetypes['Frequency'].sum() - for f in 
list(clonetypes['Frequency']): - p = f/sum - p = round(p, 4) - p = str(p*100) + '%' - Proportion.append(p) - clonetypes['Proportion'] = Proportion - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH', - 'count': filtered_h_count, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGK', - 'count': filtered_k_count, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGL', - 'count': filtered_l_count, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH and IGK', - 'count': paired_k, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH and IGL', - 'count': paired_l, - 'total_count': all_cells - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians=[] - for line in f: - line = line.strip('\n').split(':') - medians.append(int(line[1])) vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': all_cells }) vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medians[1], - 'total_count': np.nan + 'item': 'Cells with IGH', + 'count': results_h_count, + 'total_count': all_cells + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGK', + 'count': results_k_count, + 'total_count': all_cells }) vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medians[2], - 'total_count': np.nan + 'item': 'Cells with IGL', + 'count': results_l_count, + 'total_count': all_cells + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired IGH and IGK', + 'count': paired_k, + 'total_count': all_cells }) vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medians[3], - 'total_count': np.nan + 'item': 'Cells with paired IGH and IGL', + 'count': paired_l, + 'total_count': all_cells }) - df = pd.DataFrame(vdj_sum_summary, - columns=['item', 'count', 'total_count']) + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians=[] + for line in f: + line = line.strip('\n').split(':') + medians.append(int(line[1])) - df['count'] = df['count'].apply(int) - - df['percent'] = df['count']/(df.total_count.astype('float')) * 100 + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) - df['percent'] = df['percent'].apply( - lambda x: round(x, 2) - ) - df['count'] = df['count'].apply(utils.format_number) + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medians[3], + 'total_count': np.nan + }) + df = pd.DataFrame(vdj_sum_summary, + columns=['item', 'count', 'total_count']) - def percent_str_func(row): - need_percent = bool( - re.search("Cells with", row["item"], 
flags=re.IGNORECASE)) - if need_percent: - return "(" + str(row["percent"]) + "%)" - else: - return "" + df['count'] = df['count'].apply(int) + + df['percent'] = df['count']/(df.total_count.astype('float')) * 100 - df['percent_str'] = df.apply( - lambda row: percent_str_func(row), axis=1 - ) + df['percent'] = df['percent'].apply( + lambda x: round(x, 2) + ) + df['count'] = df['count'].apply(utils.format_number) - def gen_stat(summary, stat_file): - stat = summary - stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] - stat = stat.loc[:, ["item", "new_count"]] - stat.to_csv(stat_file, sep=":", header=None, index=False) + df['percent_str'] = df.apply( + lambda row: percent_str_func(row), axis=1 + ) - gen_stat(df, stat_file) + gen_stat(df, stat_file) -# clonetype table + # clonetype table - title = 'Clonetypes' - table_dict = step.get_table(title, 'clonetypes_table', clonetypes) + title = 'Clonetypes' + table_dict = self.get_table(title, 'clonetypes_table', clonetypes) - step.add_data_item(table_dict=table_dict) + self.add_data_item(table_dict=table_dict) - step.clean_up() + self.clean_up() + + os.remove(f'{ass_dir}/tmp.txt') + + +@utils.add_log +def vdj_sum(args): + step_name = 'vdj_sum' + vdj_sum_obj = Vdj_sum(args, step_name) + vdj_sum_obj.run() def get_opts_vdj_sum(parser, sub_program): -- Gitee From fcb0e22d67508240156dc359af2a74e4ace876cd Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 8 Jun 2021 16:32:17 +0800 Subject: [PATCH 31/96] solve vdj_sum and get_plot_elements warn --- .../tools/cellranger3/get_plot_elements.py | 3 ++ celescope/tracer_vdj/go_assemble.py | 35 ++++++++++--------- celescope/tracer_vdj/vdj_sum.py | 34 +++++++++--------- 3 files changed, 40 insertions(+), 32 deletions(-) diff --git a/celescope/tools/cellranger3/get_plot_elements.py b/celescope/tools/cellranger3/get_plot_elements.py index f4431f51..391ea744 100755 --- a/celescope/tools/cellranger3/get_plot_elements.py +++ b/celescope/tools/cellranger3/get_plot_elements.py @@ -117,6 +117,9 @@ def segment_log_plot_by_length(y_data, x_start, x_end): this_segment_len = 0.0 segment_idx = [x_start] + np.seterr(divide = 'ignore') + np.seterr(invalid='ignore') + for i in range(x_start, x_end): last_i = max(x_start, i-1) dx = (np.log(i) - np.log(last_i)) / log_max_x diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 264febb0..b94a448e 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -48,14 +48,11 @@ def get_umi_count(fq): return res +@utils.add_log def assemble_summary(outdir, sample, type): count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' UMIs = pd.read_csv(count_file, sep='\t') - - all_ = UMIs['UMI'].tolist() - medians = int(np.median(all_)) - all_UMIs = sum(all_) stat_file = outdir + '/stat.txt' @@ -71,28 +68,31 @@ def assemble_summary(outdir, sample, type): TRB_UMIs_count = sum(TRB_UMIs) medianB = int(np.median(TRB_UMIs)) + all_umi_count = TRA_UMIs + TRB_UMIs + medianAll = int(np.median(all_umi_count)) + totals = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ 'item': f'All UMIs mapped to TRA and TRB', 'count': totals, - 'total_count': all_UMIs, + 'total_count': np.nan, }) go_assemble_summary.append({ 'item': f'UMIs mapped to TRA', 'count': TRA_UMIs_count, - 'total_count': all_UMIs, + 'total_count': totals, }) go_assemble_summary.append({ 'item': f'UMIs mapped to TRB', 'count': TRB_UMIs_count, - 'total_count': all_UMIs, + 'total_count': totals, }) with open(f'{outdir}/tmp.txt', 'w') as f: 
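# --- editor's note --------------------------------------------------------
# A minimal, runnable sketch (hypothetical values, not part of the patch) of
# the median logic this hunk introduces just above: `TRA_UMIs + TRB_UMIs` is
# Python list concatenation, so `medianAll` is the median over the pooled
# per-cell UMI counts of both chains rather than a sum, and `int()` truncates
# a .5 median downward. Variable names mirror the patch; data is made up.
import numpy as np
TRA_UMIs = [12, 7, 30]                      # hypothetical per-cell TRA UMI counts
TRB_UMIs = [9, 22]                          # hypothetical per-cell TRB UMI counts
all_umi_count = TRA_UMIs + TRB_UMIs         # concatenation -> [12, 7, 30, 9, 22]
medianAll = int(np.median(all_umi_count))   # median of pooled list -> 12
medianA = int(np.median(TRA_UMIs))          # 12
medianB = int(np.median(TRB_UMIs))          # np.median -> 15.5, int() -> 15
# ---------------------------------------------------------------------------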
- f.write(f'Madian UMIs per cell:{medians}\n') + f.write(f'Madian UMIs per cell:{medianAll}\n') f.write(f'Median TRA UMIs per cell:{medianA}\n') f.write(f'Median TRB UMIs per cell:{medianB}\n') @@ -105,41 +105,44 @@ def assemble_summary(outdir, sample, type): IGK_UMIs = [get_umi_count(fq) for fq in IGKs] IGL_UMIs = [get_umi_count(fq) for fq in IGLs] + all_umi_count = IGH_UMIs + IGL_UMIs + IGK_UMIs + medianAll = int(np.median(all_umi_count)) + IGH = sum(IGH_UMIs) - medianH = np.median(IGH_UMIs) + medianH = int(np.median(IGH_UMIs)) IGK = sum(IGK_UMIs) - medianK = np.median(IGK_UMIs) + medianK = int(np.median(IGK_UMIs)) IGL = sum(IGL_UMIs) - medianL = np.median(IGL_UMIs) + medianL = int(np.median(IGL_UMIs)) totals = IGH + IGK + IGL go_assemble_summary.append({ 'item': f'All UMIs mapped to IGH, IGL and IGK', 'count': totals, - 'total_count': all_UMIs, + 'total_count': np.nan, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGH', 'count': IGH, - 'total_count': all_UMIs, + 'total_count': totals, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGK', 'count': IGK, - 'total_count': all_UMIs, + 'total_count': totals, }) go_assemble_summary.append({ 'item': f'UMIs mapped to IGL', 'count': IGL, - 'total_count': all_UMIs, + 'total_count': totals, }) with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medians}\n') + f.write(f'Median UMIs per cell:{medianAll}\n') f.write(f'Median IGH UMIs per Cell:{medianH}\n') f.write(f'Median IGK UMIs per Cell:{medianK}\n') f.write(f'Median IGL UMIs per Cell:{medianL}\n') diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 62492d27..532ee516 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -19,18 +19,19 @@ from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble productive = rec[rec['productive'] == True] - productive['TPM'] = '' indx = list(productive.index) + tpms = [] for i in indx: - cell_name = productive.at[i, 'cell_name'] - rec_id = productive.at[i, 'recombinant_id'] + cell_name = productive.loc[i, 'cell_name'] + rec_id = productive.loc[i, 'recombinant_id'] with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: for line in tsvf: if rec_id in line: line = line.rstrip() line = line.split('\t') tpm = float(line[4]) - productive.loc[i, 'TPM'] = tpm + tpms.append(tpm) + productive.insert(loc=productive.shape[1], column='TPM', value=tpms) return productive @@ -171,8 +172,9 @@ class Vdj_sum(Step): proportions = [] for f in list(clonetypes['Frequency']): p = f/sum - p = round(p, 4) - p = str(p * 100) + '%' + p = p * 100 + p = round(p, 2) + p = str(p) + '%' proportions.append(p) clonetypes['Proportion'] = proportions clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) @@ -192,19 +194,19 @@ class Vdj_sum(Step): vdj_sum_summary.append({ 'item': 'Cells with TRA', 'count': TRA_chain_num, - 'total_count': all_cells, + 'total_count': productive_cells_num, }) vdj_sum_summary.append({ 'item': 'Cells with TRB', 'count': TRB_chain_num, - 'total_count': all_cells, + 'total_count': productive_cells_num, }) vdj_sum_summary.append({ 'item': 'Cells with paired TRA and TRB', 'count': paired_cell, - 'total_count': all_cells, + 'total_count': productive_cells_num, }) with open(f'{ass_dir}/tmp.txt', 'r') as f: @@ -301,8 +303,9 @@ class Vdj_sum(Step): sum = 
clonetypes['Frequency'].sum() for f in list(clonetypes['Frequency']): p = f/sum - p = round(p, 4) - p = str(p*100) + '%' + p = p * 100 + p = round(p, 2) + p = str(p) + '%' Proportion.append(p) clonetypes['Proportion'] = Proportion clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) @@ -322,25 +325,25 @@ class Vdj_sum(Step): vdj_sum_summary.append({ 'item': 'Cells with IGH', 'count': results_h_count, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ 'item': 'Cells with IGK', 'count': results_k_count, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ 'item': 'Cells with IGL', 'count': results_l_count, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ 'item': 'Cells with paired IGH and IGK', 'count': paired_k, - 'total_count': all_cells + 'total_count': productive_cells_num }) vdj_sum_summary.append({ @@ -422,7 +425,6 @@ def get_opts_vdj_sum(parser, sub_program): parser.add_argument('--ass_dir', help='assemble dir', required=True) parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) - parser.add_argument('--UMI_min', help='int, min UMI per cell, if not set, will be counted by UMI rank 20', default='auto') -- Gitee From 56ce8b7f999b42eceff38c4ce01f3a8321cfbd49 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 8 Jun 2021 19:22:26 +0800 Subject: [PATCH 32/96] drop useless code --- celescope/tracer_vdj/go_assemble.py | 54 +++++++++++++---------------- celescope/tracer_vdj/split_fastq.py | 14 ++++---- celescope/tracer_vdj/vdj_sum.py | 39 +++++++++------------ 3 files changed, 46 insertions(+), 61 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index b94a448e..8bc63b18 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -1,11 +1,10 @@ -import argparse +import re +import pandas as pd import os from os import listdir from os.path import isfile, join from concurrent.futures import ProcessPoolExecutor from celescope.tools import utils -from celescope.tools.utils import * -import datetime import glob import pysam import numpy as np @@ -19,7 +18,6 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' - def gen_stat(summary, stat_file): stat = summary stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] @@ -28,12 +26,11 @@ def gen_stat(summary, stat_file): def percent_str_func(row): - need_percent = bool( - re.search("Cells with", row["item"], flags=re.IGNORECASE)) - if need_percent: - return "(" + str(row["percent"]) + "%)" - else: - return "" + need_percent = bool(re.search("Cells with", row["item"], flags=re.IGNORECASE)) + if need_percent: + return "(" + str(row["percent"]) + "%)" + else: + return "" def get_umi_count(fq): @@ -41,7 +38,6 @@ def get_umi_count(fq): with pysam.FastxFile(fq) as fh: for entry in fh: attr = entry.name.split('_') - barcode = attr[0] umi = attr[1] umis.append(umi) res = len(set(umis)) @@ -49,16 +45,14 @@ def get_umi_count(fq): @utils.add_log -def assemble_summary(outdir, sample, type): - - count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' - UMIs = pd.read_csv(count_file, sep='\t') +def assemble_summary(outdir, sample, Seqtype): + # UMIs = pd.read_csv(count_file, sep='\t') stat_file = outdir + '/stat.txt' go_assemble_summary = [] - if type == 'TCR': + if Seqtype 
== 'TCR': TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') TRA_UMIs = [get_umi_count(fq) for fq in TRAs] @@ -74,19 +68,19 @@ def assemble_summary(outdir, sample, type): totals = TRA_UMIs_count + TRB_UMIs_count go_assemble_summary.append({ - 'item': f'All UMIs mapped to TRA and TRB', + 'item': 'All UMIs mapped to TRA and TRB', 'count': totals, 'total_count': np.nan, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to TRA', + 'item': 'UMIs mapped to TRA', 'count': TRA_UMIs_count, 'total_count': totals, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to TRB', + 'item': 'UMIs mapped to TRB', 'count': TRB_UMIs_count, 'total_count': totals, }) @@ -96,7 +90,7 @@ def assemble_summary(outdir, sample, type): f.write(f'Median TRA UMIs per cell:{medianA}\n') f.write(f'Median TRB UMIs per cell:{medianB}\n') - elif type == 'BCR': + elif Seqtype == 'BCR': IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') @@ -118,25 +112,25 @@ def assemble_summary(outdir, sample, type): totals = IGH + IGK + IGL go_assemble_summary.append({ - 'item': f'All UMIs mapped to IGH, IGL and IGK', + 'item': 'All UMIs mapped to IGH, IGL and IGK', 'count': totals, 'total_count': np.nan, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to IGH', + 'item': 'UMIs mapped to IGH', 'count': IGH, 'total_count': totals, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to IGK', + 'item': 'UMIs mapped to IGK', 'count': IGK, 'total_count': totals, }) go_assemble_summary.append({ - 'item': f'UMIs mapped to IGL', + 'item': 'UMIs mapped to IGL', 'count': IGL, 'total_count': totals, }) @@ -231,7 +225,7 @@ class Go_assemble(Step): def __init__(self, args, step_name): Step.__init__(self, args, step_name) self.species = args.species - self.type = args.type + self.Seqtype = args.Seqtype self.thread = int(args.thread) self.fastq_dir = args.fastq_dir @@ -251,7 +245,7 @@ class Go_assemble(Step): tracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.type) + assemble_summary(self.outdir, self.sample, self.Seqtype) def run_bracer(self): @@ -268,13 +262,13 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.type) + assemble_summary(self.outdir, self.sample, self.Seqtype) @utils.add_log def run(self): - if self.type == 'TCR': + if self.Seqtype == 'TCR': self.run_tracer() - elif self.type == 'BCR': + elif self.Seqtype == 'BCR': self.run_bracer() self.clean_up() @@ -291,6 +285,6 @@ def get_opts_go_assemble(parser, sub_program): if sub_program: parser = s_common(parser) parser.add_argument('--fastq_dir', required=True) - parser.add_argument('--type', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index ac11ce47..a0b7973e 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -1,8 +1,6 @@ import pysam from collections import defaultdict import os -import argparse -import datetime import pandas as pd from Bio.Seq import Seq import glob @@ -10,7 +8,7 @@ from celescope.tools import utils from 
celescope.tools.Step import Step, s_common -def get_barcodes(match_dir, type): +def get_barcodes(match_dir, Seqtype): """ get reversed barcodes VDJ barcodes and RNA barcodes are complementary and reversed @@ -21,9 +19,9 @@ def get_barcodes(match_dir, type): cluster_data = pd.read_csv(clusterFile, sep='\t') # filter barcodes - if type == 'TCR': + if Seqtype == 'TCR': clusters = cluster_data[cluster_data['cell_type'] == 'T cells']['cluster'].tolist() - elif type == 'BCR': + elif Seqtype == 'BCR': clusters = cluster_data[cluster_data['cell_type'] == 'B cells']['cluster'].tolist() tsne = glob.glob(f'{match_dir}/06.analysis/*_tsne_coord.tsv') @@ -66,7 +64,7 @@ class Split_fastq(Step): def __init__(self, args, step_name): Step.__init__(self, args, step_name) - self.type = args.type + self.Seqtype = args.Seqtype self.fq = args.fq self.match_dir = args.match_dir self.fq_outdir = f'{self.outdir}/fastq' @@ -90,7 +88,7 @@ class Split_fastq(Step): if not os.path.exists(self.fq_outdir): os.makedirs(self.fq_outdir) - barcodes = get_barcodes(self.match_dir, self.type) + barcodes = get_barcodes(self.match_dir, self.Seqtype) barcode_reads_dict = defaultdict(list) # reads from VDJ data for each barcode reads_count_dict = {} # reads count for each barcode @@ -161,7 +159,7 @@ def get_opts_split_fastq(parser, sub_program=True): parser = s_common(parser) parser.add_argument('--fq', required=True) parser.add_argument('--match_dir', help='matched rna_dir') - parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 532ee516..ae5eaad2 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -1,23 +1,16 @@ -import pysam -from collections import defaultdict import os -import argparse -import datetime import pandas as pd from Bio.Seq import Seq -import glob -import re import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common -import glob from celescope.tools.cellranger3 import get_plot_elements -import json from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): - rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') + # ass_dir outdir/sample/04.go_assemble productive = rec[rec['productive'] == True] indx = list(productive.index) tpms = [] @@ -32,16 +25,16 @@ def tpm_count(ass_dir): tpm = float(line[4]) tpms.append(tpm) productive.insert(loc=productive.shape[1], column='TPM', value=tpms) - + return productive -def filtering(type, ass_dir, outdir): +def filtering(Seqtype, ass_dir, outdir): if not os.path.exists(outdir): os.makedirs(outdir) - if type == 'TCR': + if Seqtype == 'TCR': data = tpm_count(ass_dir) cell_name = set(list(data['cell_name'])) filtered = pd.DataFrame() @@ -60,7 +53,7 @@ def filtering(type, ass_dir, outdir): filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - elif type == 'BCR': + elif Seqtype == 'BCR': data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') data = data[data['FUNCTIONAL'] == True] @@ -100,7 +93,7 @@ class Vdj_sum(Step): """ def __init__(self, args, step_name): Step.__init__(self, args, step_name) - self.type = args.type + self.Seqtype = args.Seqtype self.fastq_dir = 
args.fastq_dir self.ass_dir = args.ass_dir @@ -110,9 +103,9 @@ class Vdj_sum(Step): ass_dir = self.ass_dir outdir = self.outdir fastq_dir = self.fastq_dir - type = self.type + Seqtype = self.Seqtype - results = filtering(type, ass_dir, outdir) + results = filtering(Seqtype, ass_dir, outdir) stat_file = outdir + '/stat.txt' @@ -124,7 +117,7 @@ class Vdj_sum(Step): all_cells = count_umi.shape[0] - if type == 'TCR': + if Seqtype == 'TCR': productive_cells = set(results['cell_name'].tolist()) @@ -168,10 +161,10 @@ class Vdj_sum(Step): clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - sum = clonetypes['Frequency'].sum() + sum_c = clonetypes['Frequency'].sum() proportions = [] for f in list(clonetypes['Frequency']): - p = f/sum + p = f/sum_c p = p * 100 p = round(p, 2) p = str(p) + '%' @@ -234,7 +227,7 @@ class Vdj_sum(Step): }) - elif type == 'BCR': + elif Seqtype == 'BCR': productive_cells = set(results['CELL'].tolist()) @@ -300,9 +293,9 @@ class Vdj_sum(Step): clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) Proportion = [] - sum = clonetypes['Frequency'].sum() + sum_c = clonetypes['Frequency'].sum() for f in list(clonetypes['Frequency']): - p = f/sum + p = f/sum_c p = p * 100 p = round(p, 2) p = str(p) + '%' @@ -424,7 +417,7 @@ def get_opts_vdj_sum(parser, sub_program): parser = s_common(parser) parser.add_argument('--ass_dir', help='assemble dir', required=True) parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) - parser.add_argument('--type', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) -- Gitee From e43bce47f2617081496a881f1a4990d6abf9cd4d Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Wed, 9 Jun 2021 16:08:27 +0800 Subject: [PATCH 33/96] pylint code --- celescope/tracer_vdj/go_assemble.py | 6 +- celescope/tracer_vdj/multi_tracer_vdj.py | 2 +- celescope/tracer_vdj/vdj_sum.py | 779 +++++++++++------------ 3 files changed, 385 insertions(+), 402 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 8bc63b18..06ae5371 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -45,7 +45,7 @@ def get_umi_count(fq): @utils.add_log -def assemble_summary(outdir, sample, Seqtype): +def assemble_summary(outdir, Seqtype): # UMIs = pd.read_csv(count_file, sep='\t') stat_file = outdir + '/stat.txt' @@ -245,7 +245,7 @@ class Go_assemble(Step): tracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype) def run_bracer(self): @@ -262,7 +262,7 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - assemble_summary(self.outdir, self.sample, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype) @utils.add_log def run(self): diff --git a/celescope/tracer_vdj/multi_tracer_vdj.py b/celescope/tracer_vdj/multi_tracer_vdj.py index 18ee9f98..5055ec9a 100755 --- a/celescope/tracer_vdj/multi_tracer_vdj.py +++ b/celescope/tracer_vdj/multi_tracer_vdj.py @@ -1,4 +1,4 @@ -from celescope.tracer_vdj.__init__ import __STEPS__, __ASSAY__ +from celescope.tracer_vdj.__init__ import __ASSAY__ from celescope.tools.Multi import Multi diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index ae5eaad2..60d86d46 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ 
-5,419 +5,402 @@ import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common from celescope.tools.cellranger3 import get_plot_elements -from celescope.tracer_vdj.go_assemble import percent_str_func, gen_stat def tpm_count(ass_dir): - rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') - # ass_dir outdir/sample/04.go_assemble - productive = rec[rec['productive'] == True] - indx = list(productive.index) - tpms = [] - for i in indx: - cell_name = productive.loc[i, 'cell_name'] - rec_id = productive.loc[i, 'recombinant_id'] - with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: - for line in tsvf: - if rec_id in line: - line = line.rstrip() - line = line.split('\t') - tpm = float(line[4]) - tpms.append(tpm) - productive.insert(loc=productive.shape[1], column='TPM', value=tpms) - - return productive + rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') + # ass_dir outdir/sample/04.go_assemble + productive = rec[rec['productive'] == True] + indx = list(productive.index) + tpms = [] + for i in indx: + cell_name = productive.loc[i, 'cell_name'] + rec_id = productive.loc[i, 'recombinant_id'] + with open(f'{ass_dir}/tracer/{cell_name}/expression_quantification/abundance.tsv') as tsvf: + for line in tsvf: + if rec_id in line: + line = line.rstrip() + line = line.split('\t') + tpm = float(line[4]) + tpms.append(tpm) + productive.insert(loc=productive.shape[1], column='TPM', value=tpms) + + return productive def filtering(Seqtype, ass_dir, outdir): - - if not os.path.exists(outdir): - os.makedirs(outdir) - - if Seqtype == 'TCR': - data = tpm_count(ass_dir) - cell_name = set(list(data['cell_name'])) - filtered = pd.DataFrame() - for name in cell_name: - count_data = data[data['cell_name'] == name] - tra = count_data[count_data['locus'] == 'A'] - trb = count_data[count_data['locus'] == 'B'] - if tra.empty is not True: - tra = tra.sort_values(by='TPM', ascending=False) - tra = tra.head(1) - filtered = filtered.append(tra, ignore_index=True) - if trb.empty is not True: - trb = trb.sort_values(by='TPM', ascending=False) - trb = trb.head(1) - filtered = filtered.append(trb, ignore_index=True) - - filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - - elif Seqtype == 'BCR': - - data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') - data = data[data['FUNCTIONAL'] == True] - cell_name = set(list(data['CELL'])) - filtered = pd.DataFrame() - for name in cell_name: - count_cell = data[data['CELL'] == name] - count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) - count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) - count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) - count_k_l = count_k.append(count_l) - if count_h.empty is not True: - count_h = count_h.sort_values(by='TPM', ascending=False) - count_h = count_h.head(1) - filtered = filtered.append(count_h, ignore_index=True) - if count_k_l.empty is not True: - count_k_l = count_k_l.sort_values(by='TPM', ascending=False) - count_k_l = count_k_l.head(1) - filtered = filtered.append(count_k_l, ignore_index=True) - - filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - - return filtered - + if not os.path.exists(outdir): + os.makedirs(outdir) + + if Seqtype == 'TCR': + data = tpm_count(ass_dir) + cell_name = set(list(data['cell_name'])) + filtered = pd.DataFrame() + for name in cell_name: + count_data = data[data['cell_name'] == name] + tra = 
count_data[count_data['locus'] == 'A'] + trb = count_data[count_data['locus'] == 'B'] + if tra.empty is not True: + tra = tra.sort_values(by='TPM', ascending=False) + tra = tra.head(1) + filtered = filtered.append(tra, ignore_index=True) + if trb.empty is not True: + trb = trb.sort_values(by='TPM', ascending=False) + trb = trb.head(1) + filtered = filtered.append(trb, ignore_index=True) + + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') + + elif Seqtype == 'BCR': + + data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') + data = data[data['FUNCTIONAL'] == True] + cell_name = set(list(data['CELL'])) + filtered = pd.DataFrame() + for name in cell_name: + count_cell = data[data['CELL'] == name] + count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) + count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) + count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) + count_k_l = count_k.append(count_l) + if count_h.empty is not True: + count_h = count_h.sort_values(by='TPM', ascending=False) + count_h = count_h.head(1) + filtered = filtered.append(count_h, ignore_index=True) + if count_k_l.empty is not True: + count_k_l = count_k_l.sort_values(by='TPM', ascending=False) + count_k_l = count_k_l.head(1) + filtered = filtered.append(count_k_l, ignore_index=True) + + filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') + + return filtered + class Vdj_sum(Step): - """ - Features - - - Filter tracer results by TPM. - - Calculate clonetypes. - - Output - - - `05.vdj_sum/filtered.txt` Filtered results of tracer. Each cell has unique chain for each locus. - - `05.vdj_sum/clonetypes.txt` Clonetypes calculation. 5 (TCR) or 6 (BCR) columns, clonetypeId, (detailed clonetypes), frequency, proportion. - """ - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - self.Seqtype = args.Seqtype - self.fastq_dir = args.fastq_dir - self.ass_dir = args.ass_dir - - - @utils.add_log - def run(self): - ass_dir = self.ass_dir - outdir = self.outdir - fastq_dir = self.fastq_dir - Seqtype = self.Seqtype - - results = filtering(Seqtype, ass_dir, outdir) - - stat_file = outdir + '/stat.txt' - - vdj_sum_summary = [] - - count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' - - count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) - - all_cells = count_umi.shape[0] - - if Seqtype == 'TCR': - - productive_cells = set(results['cell_name'].tolist()) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - productive_cells_num = len(productive_cells) - - TRA_chain = results[results['locus'] == 'A'] - TRA_chain_num = TRA_chain.shape[0] - TRB_chain = results[results['locus'] == 'B'] - TRB_chain_num = TRB_chain.shape[0] - - TRAs, TRBs = [], [] - paired_cell = 0 - for cell in productive_cells: - tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] - if tmp1.empty is not True: - chainA = tmp1['CDR3aa'].tolist()[0] - TRAs.append(chainA) - else: - TRAs.append('NaN') - - tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] - if tmp2.empty is not True: - chainB = tmp2['CDR3aa'].tolist()[0] - TRBs.append(chainB) - else: - TRBs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_cell += 1 - - clonetypes_table = pd.DataFrame() - clonetypes_table['TRA_chain'] = TRAs - clonetypes_table['TRB_chain'] = TRBs - clonetypes_table['Frequency'] = '' - - clonetypes = 
clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - - sum_c = clonetypes['Frequency'].sum() - proportions = [] - for f in list(clonetypes['Frequency']): - p = f/sum_c - p = p * 100 - p = round(p, 2) - p = str(p) + '%' - proportions.append(p) - clonetypes['Proportion'] = proportions - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) - - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRA', - 'count': TRA_chain_num, - 'total_count': productive_cells_num, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with TRB', - 'count': TRB_chain_num, - 'total_count': productive_cells_num, - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired TRA and TRB', - 'count': paired_cell, - 'total_count': productive_cells_num, - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians = [] - for line in f: - line = line.rstrip('\n').split(':') - medians.append(int(line[1])) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) - - - elif Seqtype == 'BCR': - - productive_cells = set(results['CELL'].tolist()) - - productive_cells_num = len(productive_cells) - - count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") - - count_umi.to_csv(count_umi_file, sep='\t') - - self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - - results_h = results[results['LOCUS'] == 'H'] - results_k = results[results['LOCUS'] == 'K'] - results_l = results[results['LOCUS'] == 'L'] - results_h_count = results_h.shape[0] - results_k_count = results_k.shape[0] - results_l_count = results_l.shape[0] - - IGHs, IGKs, IGLs = [], [], [] - - paired_k, paired_l = 0, 0 - - for cell in productive_cells: - tmp1 = results_h[results_h['CELL'] == cell] - if tmp1.empty is not True: - seq = tmp1['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGHs.append(aaseq) - else: - IGHs.append('NaN') - - tmp2 = results_l[results_l['CELL'] == cell] - if tmp2.empty is not True: - seq = tmp2['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGLs.append(aaseq) - else: - IGLs.append('NaN') - - tmp3 = results_k[results_k['CELL'] == cell] - if tmp3.empty is not True: - seq = tmp3['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGKs.append(aaseq) - else: - IGKs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_l += 1 - if not tmp1.empty and not tmp3.empty: - paired_k += 1 - - clonetypes_table = pd.DataFrame() - - clonetypes_table['IGH_chain'] = IGHs - clonetypes_table['IGL_chain'] = IGLs - clonetypes_table['IGK_chain'] = IGKs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) - - Proportion = [] - sum_c = clonetypes['Frequency'].sum() - for f in 
list(clonetypes['Frequency']): - p = f/sum_c - p = p * 100 - p = round(p, 2) - p = str(p) + '%' - Proportion.append(p) - clonetypes['Proportion'] = Proportion - clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) - clonetypes = clonetypes.reset_index() - - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') - - - vdj_sum_summary.append({ - 'item': 'Estimated Number of Cells', - 'count': productive_cells_num, - 'total_count': all_cells - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGH', - 'count': results_h_count, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGK', - 'count': results_k_count, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with IGL', - 'count': results_l_count, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired IGH and IGK', - 'count': paired_k, - 'total_count': productive_cells_num - }) - - vdj_sum_summary.append({ - 'item': 'Cells with paired IGH and IGL', - 'count': paired_l, - 'total_count': all_cells - }) - - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians=[] - for line in f: - line = line.strip('\n').split(':') - medians.append(int(line[1])) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medians[3], - 'total_count': np.nan - }) - - df = pd.DataFrame(vdj_sum_summary, - columns=['item', 'count', 'total_count']) - - df['count'] = df['count'].apply(int) - - df['percent'] = df['count']/(df.total_count.astype('float')) * 100 - - df['percent'] = df['percent'].apply( - lambda x: round(x, 2) - ) - df['count'] = df['count'].apply(utils.format_number) - - df['percent_str'] = df.apply( - lambda row: percent_str_func(row), axis=1 - ) - - gen_stat(df, stat_file) - - # clonetype table - - title = 'Clonetypes' - table_dict = self.get_table(title, 'clonetypes_table', clonetypes) - - self.add_data_item(table_dict=table_dict) - - self.clean_up() - - os.remove(f'{ass_dir}/tmp.txt') + """ + Features + + - Filter tracer results by TPM. + - Calculate clonetypes. + + Output + + - `05.vdj_sum/filtered.txt` Filtered results of tracer. Each cell has unique chain for each locus. + - `05.vdj_sum/clonetypes.txt` Clonetypes calculation. 5 (TCR) or 6 (BCR) columns, clonetypeId, (detailed clonetypes), frequency, proportion. 
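# --- editor's note --------------------------------------------------------
# A compact sketch (hypothetical CDR3 strings) of the clonotype table that
# run() builds below: identical chain combinations are grouped, occurrences
# counted, and each frequency rendered as a percentage rounded to two
# decimals. The `size()` idiom here is an editor's substitution for the
# patch's placeholder-column-plus-count groupby; it yields the same counts.
import pandas as pd
chains = pd.DataFrame({'TRA_chain': ['CAVRGF', 'CAVRGF', 'NaN'],
                       'TRB_chain': ['CASSLG', 'CASSLG', 'CASSLG']})
clonetypes = (chains.groupby(['TRA_chain', 'TRB_chain']).size()
                    .reset_index(name='Frequency')
                    .sort_values(by='Frequency', ascending=False))
sum_c = clonetypes['Frequency'].sum()
clonetypes['Proportion'] = clonetypes['Frequency'].apply(
    lambda f: f'{round(f / sum_c * 100, 2)}%')          # e.g. '66.67%'
clonetypes['clonetypeId'] = range(1, clonetypes.shape[0] + 1)
# ---------------------------------------------------------------------------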
+ """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + self.Seqtype = args.Seqtype + self.fastq_dir = args.fastq_dir + self.ass_dir = args.ass_dir + + + @utils.add_log + def run(self): + ass_dir = self.ass_dir + outdir = self.outdir + fastq_dir = self.fastq_dir + Seqtype = self.Seqtype + + results = filtering(Seqtype, ass_dir, outdir) + + stat_file = outdir + '/stat.txt' + + vdj_sum_summary = [] + + count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' + + count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + + if Seqtype == 'TCR': + + productive_cells = set(results['cell_name'].tolist()) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + productive_cells_num = len(productive_cells) + + TRA_chain = results[results['locus'] == 'A'] + TRA_chain_num = TRA_chain.shape[0] + TRB_chain = results[results['locus'] == 'B'] + TRB_chain_num = TRB_chain.shape[0] + + TRAs, TRBs = [], [] + paired_cell = 0 + for cell in productive_cells: + tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] + if tmp1.empty is not True: + chainA = tmp1['CDR3aa'].tolist()[0] + TRAs.append(chainA) + else: + TRAs.append('NaN') + + tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] + if tmp2.empty is not True: + chainB = tmp2['CDR3aa'].tolist()[0] + TRBs.append(chainB) + else: + TRBs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_cell += 1 + + clonetypes_table = pd.DataFrame() + clonetypes_table['TRA_chain'] = TRAs + clonetypes_table['TRB_chain'] = TRBs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) + + sum_c = clonetypes['Frequency'].sum() + proportions = [] + for f in list(clonetypes['Frequency']): + p = f/sum_c + p = p * 100 + p = round(p, 2) + p = str(p) + '%' + proportions.append(p) + clonetypes['Proportion'] = proportions + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + vdj_sum_summary.append({ + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': np.nan, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRA', + 'count': TRA_chain_num, + 'total_count': productive_cells_num, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with TRB', + 'count': TRB_chain_num, + 'total_count': productive_cells_num, + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired TRA and TRB', + 'count': paired_cell, + 'total_count': productive_cells_num, + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians = [] + for line in f: + line = line.rstrip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medians[2], + 'total_count': np.nan + }) + + + elif Seqtype == 'BCR': + + productive_cells = set(results['CELL'].tolist()) + + 
productive_cells_num = len(productive_cells) + + count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") + + count_umi.to_csv(count_umi_file, sep='\t') + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) + + results_h = results[results['LOCUS'] == 'H'] + results_k = results[results['LOCUS'] == 'K'] + results_l = results[results['LOCUS'] == 'L'] + results_h_count = results_h.shape[0] + results_k_count = results_k.shape[0] + results_l_count = results_l.shape[0] + + IGHs, IGKs, IGLs = [], [], [] + + paired_k, paired_l = 0, 0 + + for cell in productive_cells: + tmp1 = results_h[results_h['CELL'] == cell] + if tmp1.empty is not True: + seq = tmp1['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGHs.append(aaseq) + else: + IGHs.append('NaN') + + tmp2 = results_l[results_l['CELL'] == cell] + if tmp2.empty is not True: + seq = tmp2['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGLs.append(aaseq) + else: + IGLs.append('NaN') + + tmp3 = results_k[results_k['CELL'] == cell] + if tmp3.empty is not True: + seq = tmp3['JUNCTION'].tolist()[0] + seq = Seq(seq) + aaseq = seq.translate() + IGKs.append(aaseq) + else: + IGKs.append('NaN') + + if not tmp1.empty and not tmp2.empty: + paired_l += 1 + if not tmp1.empty and not tmp3.empty: + paired_k += 1 + + clonetypes_table = pd.DataFrame() + + clonetypes_table['IGH_chain'] = IGHs + clonetypes_table['IGL_chain'] = IGLs + clonetypes_table['IGK_chain'] = IGKs + clonetypes_table['Frequency'] = '' + + clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + + Proportion = [] + sum_c = clonetypes['Frequency'].sum() + for f in list(clonetypes['Frequency']): + p = f/sum_c + p = p * 100 + p = round(p, 2) + p = str(p) + '%' + Proportion.append(p) + clonetypes['Proportion'] = Proportion + clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + + + vdj_sum_summary.append({ + 'item': 'Estimated Number of Cells', + 'count': productive_cells_num, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGH', + 'count': results_h_count, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGK', + 'count': results_k_count, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with IGL', + 'count': results_l_count, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired IGH and IGK', + 'count': paired_k, + 'total_count': productive_cells_num + }) + + vdj_sum_summary.append({ + 'item': 'Cells with paired IGH and IGL', + 'count': paired_l, + 'total_count': productive_cells_num + }) + + with open(f'{ass_dir}/tmp.txt', 'r') as f: + medians=[] + for line in f: + line = line.strip('\n').split(':') + medians.append(int(line[1])) + + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': medians[0], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medians[1], + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medians[2], + 
'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medians[3], + 'total_count': np.nan + }) + + df = pd.DataFrame(vdj_sum_summary, + columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + + # clonetype table + + title = 'Clonetypes' + table_dict = self.get_table(title, 'clonetypes_table', clonetypes) + + self.add_data_item(table_dict=table_dict) + + self.clean_up() + + os.remove(f'{ass_dir}/tmp.txt') @utils.add_log def vdj_sum(args): - step_name = 'vdj_sum' - vdj_sum_obj = Vdj_sum(args, step_name) - vdj_sum_obj.run() + step_name = 'vdj_sum' + vdj_sum_obj = Vdj_sum(args, step_name) + vdj_sum_obj.run() def get_opts_vdj_sum(parser, sub_program): - if sub_program: - parser = s_common(parser) - parser.add_argument('--ass_dir', help='assemble dir', required=True) - parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) - parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + if sub_program: + parser = s_common(parser) + parser.add_argument('--ass_dir', help='assemble dir', required=True) + parser.add_argument('--fastq_dir', help='dir contains fastq', required=True) + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) -- Gitee From 06c908d2277f597367921e8f905699f84f3abe19 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Wed, 9 Jun 2021 18:38:52 +0800 Subject: [PATCH 34/96] rewrite map count by bowtie2 --- celescope/tracer_vdj/go_assemble.py | 152 +++++++++++++--------------- 1 file changed, 69 insertions(+), 83 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 06ae5371..f521a8c0 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -45,101 +45,87 @@ def get_umi_count(fq): @utils.add_log -def assemble_summary(outdir, Seqtype): - # UMIs = pd.read_csv(count_file, sep='\t') +def assemble_summary(outdir, Seqtype, sample, species): stat_file = outdir + '/stat.txt' go_assemble_summary = [] - if Seqtype == 'TCR': - TRAs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{outdir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - TRA_UMIs_count = sum(TRA_UMIs) - medianA = int(np.median(TRA_UMIs)) - TRB_UMIs_count = sum(TRB_UMIs) - medianB = int(np.median(TRB_UMIs)) - - all_umi_count = TRA_UMIs + TRB_UMIs - medianAll = int(np.median(all_umi_count)) - - totals = TRA_UMIs_count + TRB_UMIs_count - - go_assemble_summary.append({ - 'item': 'All UMIs mapped to TRA and TRB', - 'count': totals, - 'total_count': np.nan, - }) - - go_assemble_summary.append({ - 'item': 'UMIs mapped to TRA', - 'count': TRA_UMIs_count, - 'total_count': totals, - }) - - go_assemble_summary.append({ - 'item': 'UMIs mapped to TRB', - 'count': TRB_UMIs_count, - 'total_count': totals, - }) + clean_fq = f'{outdir}/../02.cutadapt/{sample}__clean_2.fq' - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Madian UMIs per cell:{medianAll}\n') - f.write(f'Median TRA UMIs per cell:{medianA}\n') - f.write(f'Median TRB UMIs per cell:{medianB}\n') + count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' - elif Seqtype == 'BCR': - IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs = 
[get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - all_umi_count = IGH_UMIs + IGL_UMIs + IGK_UMIs - medianAll = int(np.median(all_umi_count)) - - IGH = sum(IGH_UMIs) - medianH = int(np.median(IGH_UMIs)) - IGK = sum(IGK_UMIs) - medianK = int(np.median(IGK_UMIs)) - IGL = sum(IGL_UMIs) - medianL = int(np.median(IGL_UMIs)) - - totals = IGH + IGK + IGL - - go_assemble_summary.append({ - 'item': 'All UMIs mapped to IGH, IGL and IGK', - 'count': totals, - 'total_count': np.nan, - }) + count_df = pd.read_csv(count_file, sep='\t') - go_assemble_summary.append({ - 'item': 'UMIs mapped to IGH', - 'count': IGH, - 'total_count': totals, - }) + total_count = count_df['readcount'].sum() - go_assemble_summary.append({ - 'item': 'UMIs mapped to IGK', - 'count': IGK, - 'total_count': totals, + if Seqtype == 'TCR': + loci = ['A', 'B'] + + total_mapped = 0 + + for locus in loci: + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/tracer/resources/{species}/combinatorial_recombinomes/TCR_{locus} ' + f'-U {clean_fq} ' + f'-S {outdir}/TR{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'aligned exactly 1 time' in line: + res = re.findall("\d+", line) + item = f'Reads mapped to TR{locus}' + count = int(res[0]) + total_mapped += count + go_assemble_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/TR{locus}.sam') + + go_assemble_summary.insert(0, { + 'item': 'All reads Mapped to TRA and TRB', + 'count': total_mapped, + 'total_count': total_count }) - go_assemble_summary.append({ - 'item': 'UMIs mapped to IGL', - 'count': IGL, - 'total_count': totals, + elif Seqtype == 'BCR': + loci = ['H', 'L', 'K'] + + total_mapped = 0 + + for locus in loci: + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/bracer/resources/{species}/combinatorial_recombinomes/BCR_{locus} ' + f'-U {clean_fq} ' + f'-S {outdir}/BR{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'aligned exactly 1 time' in line: + res = re.findall("\d+", line) + item = f'Reads mapped to BR{locus}' + count = int(res[0]) + total_mapped += count + go_assemble_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + os.system(f'rm {outdir}/BR{locus}.sam') + go_assemble_summary.insert(0, { + 'item': 'All reads Mapped to IGH, IGL and IGK', + 'count': total_mapped, + 'total_count': total_count }) - - with open(f'{outdir}/tmp.txt', 'w') as f: - f.write(f'Median UMIs per cell:{medianAll}\n') - f.write(f'Median IGH UMIs per Cell:{medianH}\n') - f.write(f'Median IGK UMIs per Cell:{medianK}\n') - f.write(f'Median IGL UMIs per Cell:{medianL}\n') df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) -- Gitee From b57d7533ad84a55aab1b24e9101f9f36787091a7 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 10 Jun 2021 15:39:22 +0800 Subject: [PATCH 35/96] add bowtie2 map --- celescope/tracer_vdj/go_assemble.py | 44 +++-------- celescope/tracer_vdj/split_fastq.py | 2 +- celescope/tracer_vdj/vdj_sum.py | 116 ++++++++++++++++------------ 3 files changed, 78 insertions(+), 84 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 
f521a8c0..452eb54d 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -5,9 +5,6 @@ from os import listdir from os.path import isfile, join from concurrent.futures import ProcessPoolExecutor from celescope.tools import utils -import glob -import pysam -import numpy as np from celescope.tools.Step import Step, s_common @@ -18,32 +15,6 @@ BRACER_CONDA = 'bracer' BRACER_CONF = '/SGRNJ03/randd/zhouxin/software/bracer/bracer.conf' -def gen_stat(summary, stat_file): - stat = summary - stat["new_count"] = stat["count"].astype(str) + stat["percent_str"] - stat = stat.loc[:, ["item", "new_count"]] - stat.to_csv(stat_file, sep=":", header=None, index=False) - - -def percent_str_func(row): - need_percent = bool(re.search("Cells with", row["item"], flags=re.IGNORECASE)) - if need_percent: - return "(" + str(row["percent"]) + "%)" - else: - return "" - - -def get_umi_count(fq): - umis = [] - with pysam.FastxFile(fq) as fh: - for entry in fh: - attr = entry.name.split('_') - umi = attr[1] - umis.append(umi) - res = len(set(umis)) - return res - - @utils.add_log def assemble_summary(outdir, Seqtype, sample, species): @@ -51,7 +22,7 @@ def assemble_summary(outdir, Seqtype, sample, species): go_assemble_summary = [] - clean_fq = f'{outdir}/../02.cutadapt/{sample}__clean_2.fq' + clean_fq = f'{outdir}/../02.cutadapt/{sample}_clean_2.fq' count_file = f'{outdir}/../03.split_fastq/{sample}_count.txt' @@ -76,7 +47,7 @@ def assemble_summary(outdir, Seqtype, sample, species): with open(f'{outdir}/log') as fh: for line in fh: if 'aligned exactly 1 time' in line: - res = re.findall("\d+", line) + res = re.findall(r"\d+", line) item = f'Reads mapped to TR{locus}' count = int(res[0]) total_mapped += count @@ -94,6 +65,8 @@ def assemble_summary(outdir, Seqtype, sample, species): 'total_count': total_count }) + os.system(f'rm {outdir}/log') + elif Seqtype == 'BCR': loci = ['H', 'L', 'K'] @@ -111,7 +84,7 @@ def assemble_summary(outdir, Seqtype, sample, species): with open(f'{outdir}/log') as fh: for line in fh: if 'aligned exactly 1 time' in line: - res = re.findall("\d+", line) + res = re.findall(r"\d+", line) item = f'Reads mapped to BR{locus}' count = int(res[0]) total_mapped += count @@ -126,7 +99,8 @@ def assemble_summary(outdir, Seqtype, sample, species): 'count': total_mapped, 'total_count': total_count }) - + os.system(f'rm {outdir}/log') + df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) utils.gen_stat(df, stat_file) @@ -231,7 +205,7 @@ class Go_assemble(Step): tracer_summarise(self.outdir) - assemble_summary(self.outdir, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype, self.sample, self.species) def run_bracer(self): @@ -248,7 +222,7 @@ class Go_assemble(Step): bracer_summarise(self.outdir) - assemble_summary(self.outdir, self.Seqtype) + assemble_summary(self.outdir, self.Seqtype, self.sample, self.species) @utils.add_log def run(self): diff --git a/celescope/tracer_vdj/split_fastq.py b/celescope/tracer_vdj/split_fastq.py index a0b7973e..81d372d1 100755 --- a/celescope/tracer_vdj/split_fastq.py +++ b/celescope/tracer_vdj/split_fastq.py @@ -140,7 +140,7 @@ class Split_fastq(Step): i += 1 df_f['cell_name'].fillna(0, inplace=True) - + df_f.fillna(0, inplace=True) df_f = df_f.astype(int) df_f.to_csv(self.count_file, sep='\t') diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 60d86d46..d3896db0 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -5,6 +5,19 @@ 
import numpy as np from celescope.tools import utils from celescope.tools.Step import Step, s_common from celescope.tools.cellranger3 import get_plot_elements +import glob +import pysam + + +def get_umi_count(fq): + umis = [] + with pysam.FastxFile(fq) as fh: + for entry in fh: + attr = entry.name.split('_') + umi = attr[1] + umis.append(umi) + res = len(set(umis)) + return res def tpm_count(ass_dir): @@ -113,6 +126,8 @@ class Vdj_sum(Step): count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) + median_all = int(count_umi['UMI'].median()) + if Seqtype == 'TCR': productive_cells = set(results['cell_name'].tolist()) @@ -198,29 +213,31 @@ class Vdj_sum(Step): 'total_count': productive_cells_num, }) - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians = [] - for line in f: - line = line.rstrip('\n').split(':') - medians.append(int(line[1])) + TRAs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_A.fastq') + TRBs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_B.fastq') + TRA_UMIs = [get_umi_count(fq) for fq in TRAs] + TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) + medianA = int(np.median(TRA_UMIs)) + medianB = int(np.median(TRB_UMIs)) - vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': median_all, + 'total_count': np.nan + }) - vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median TRA UMIs per cell', + 'count': medianA, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median TRB UMIs per cell', + 'count': medianB, + 'total_count': np.nan + }) elif Seqtype == 'BCR': @@ -341,35 +358,41 @@ class Vdj_sum(Step): 'total_count': productive_cells_num }) - with open(f'{ass_dir}/tmp.txt', 'r') as f: - medians=[] - for line in f: - line = line.strip('\n').split(':') - medians.append(int(line[1])) + IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') + IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') + IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': medians[0], - 'total_count': np.nan - }) + IGH_UMIs = [get_umi_count(fq) for fq in IGHs] + IGK_UMIs = [get_umi_count(fq) for fq in IGKs] + IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medians[1], - 'total_count': np.nan - }) + medianH = int(np.median(IGH_UMIs)) + medianL = int(np.median(IGL_UMIs)) + medianK = int(np.median(IGK_UMIs)) - vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medians[2], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median UMIs per cell', + 'count': median_all, + 'total_count': np.nan + }) - vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medians[3], - 'total_count': np.nan - }) + vdj_sum_summary.append({ + 'item': 'Median IGH UMIs per cell', + 'count': medianH, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGL UMIs per cell', + 'count': medianL, + 'total_count': np.nan + }) + + vdj_sum_summary.append({ + 'item': 'Median IGK UMIs per cell', + 'count': medianK, + 'total_count': np.nan + }) df = pd.DataFrame(vdj_sum_summary, columns=['item', 'count', 
'total_count']) @@ -385,9 +408,6 @@ class Vdj_sum(Step): self.clean_up() - os.remove(f'{ass_dir}/tmp.txt') - - @utils.add_log def vdj_sum(args): step_name = 'vdj_sum' -- Gitee From 5a1b96f42e19d8718f63432e42cbb5ac0a33fe43 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 10 Jun 2021 19:33:54 +0800 Subject: [PATCH 36/96] add trust assemble --- celescope/__init__.py | 3 +- celescope/templates/html/trust_vdj/base.html | 161 +++++++++++++++++++ celescope/trust_vdj/__init__.py | 6 + celescope/trust_vdj/multi_trust_vdj.py | 35 ++++ celescope/trust_vdj/res_filter.py | 87 ++++++++++ celescope/trust_vdj/trust_assemble.py | 130 +++++++++++++++ 6 files changed, 421 insertions(+), 1 deletion(-) create mode 100644 celescope/templates/html/trust_vdj/base.html create mode 100644 celescope/trust_vdj/__init__.py create mode 100644 celescope/trust_vdj/multi_trust_vdj.py create mode 100644 celescope/trust_vdj/res_filter.py create mode 100644 celescope/trust_vdj/trust_assemble.py diff --git a/celescope/__init__.py b/celescope/__init__.py index 015f41d1..d1f25d8b 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -14,5 +14,6 @@ ASSAY_DICT = { 'tag': 'Single Cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', - 'tracer_vdj': 'Single Cell Full Length TCR or BCR' + 'tracer_vdj': 'Single Cell Full Length vdj assemble', + 'trust_vdj': 'Single Cell Full Length vdj assemble' } diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html new file mode 100644 index 00000000..5318bb34 --- /dev/null +++ b/celescope/templates/html/trust_vdj/base.html @@ -0,0 +1,161 @@ + + + + + + report + + + + + + + + + + + + +
+ CeleScope Report
+ + + + \ No newline at end of file diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py new file mode 100644 index 00000000..69aeb6f7 --- /dev/null +++ b/celescope/trust_vdj/__init__.py @@ -0,0 +1,6 @@ +__STEPS__ = [ + 'sample', + 'barcode', + 'trust_assemble', + 'res_filter'] +__ASSAY__ = 'trust_vdj' diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py new file mode 100644 index 00000000..93ecdedb --- /dev/null +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -0,0 +1,35 @@ +from celescope.trust_vdj.__init__ import __ASSAY__ +from celescope.tools.Multi import Multi + + +class Multi_trust_vdj(Multi): + + def trust_assemble(self, sample): + step = 'trust_assemble' + cmd_line = self.get_cmd_line(step, sample) + fq1 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R1.fq{self.fq_suffix}' + fq2 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R2.fq{self.fq_suffix}' + cmd = ( + f'{cmd_line} ' + f'--fq1 {fq1} ' + f'--fq2 {fq2} ' + f'--match_dir {self.col4_dict[sample]}' + ) + self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) + + + def res_filter(self, sample): + step = 'res_filter' + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + +def main(): + multi = Multi_trust_vdj(__ASSAY__) + multi.run() + +if __name__ == '__main__': + main() diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py new file mode 100644 index 00000000..ce46c374 --- /dev/null +++ b/celescope/trust_vdj/res_filter.py @@ -0,0 +1,87 @@ +import pandas as pd +from celescope.tools.Step import Step, s_common +from celescope.tools import utils + + +@utils.add_log +def beauty_res(outdir, barcode_report): + res = pd.read_csv(barcode_report, sep='\t') + rows = res.shape[0] + loci = ['A', 'B'] + chians = ['chain2', 'chain1'] + for l in range(len(loci)): + chain = chians[l] + locus = loci[l] + + Vgenes, Dgenes, Jgenes, Cgenes, cdr3nts, cdr3aas, readcounts, fuls = [], [], [], [], [], [], [], [] + + for i in range(rows): + attr = res.loc[i, chain] + attrs = attr.split(',') + if len(attrs) == 10: + V, D, J, C, cdr3nt, cdr3aa, readcount, fl = attrs[0], attrs[1], attrs[2], attrs[3], attrs[4], attrs[5], attrs[6], attrs[-1] + Vgenes.append(V) + Dgenes.append(D) + Jgenes.append(J) + Cgenes.append(C) + cdr3nts.append(cdr3nt) + cdr3aas.append(cdr3aa) + readcounts.append(readcount) + fuls.append(fl) + elif len(attrs) != 10: + Vgenes.append('NAN') + Dgenes.append('NAN') + Jgenes.append('NAN') + Cgenes.append('NAN') + cdr3nts.append('NAN') + cdr3aas.append('NAN') + readcounts.append('NAN') + fuls.append('NAN') + + res[f'TR{locus}_V'] = Vgenes + res[f'TR{locus}_D'] = Dgenes + res[f'TR{locus}_J'] = Jgenes + res[f'TR{locus}_C'] = Cgenes + res[f'TR{locus}_cdr3nt'] = cdr3nts + res[f'TR{locus}_cdr3aa'] = cdr3aas + res[f'TR{locus}_readcount'] = readcounts + res[f'TR{locus}_fl'] = fuls + + res.to_csv(f'{outdir}/new_barcode_report.tsv', sep='\t') + + return res + + +class Res_filter(Step): + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.sample = args.sample + + + @utils.add_log + def run(self): + barcode_report = f'{self.outdir}/../02.truse_assemble/TRUST4/{self.sample}_barcode_report.tsv' + res = beauty_res(self.outdir, barcode_report) + filtered = res[(res['TRB_fl']!='0')&(res['TRA_fl']!='0')] + fre = [''] * filtered.shape[0] + filtered.insert(filtered.shape[1], 'Frequent', fre) + + clones = 
filtered.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) + clones = clones.sort_values(by='Frequent', ascending=False) + + clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') + + +@utils.add_log +def res_filter(args): + step_name = 'res_filter' + res_filter_obj = Res_filter(args, step_name) + res_filter_obj.run() + + +def get_opts_res_filter(parser, sub_program): + if sub_program: + parser = s_common(parser) \ No newline at end of file diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py new file mode 100644 index 00000000..1f09ab19 --- /dev/null +++ b/celescope/trust_vdj/trust_assemble.py @@ -0,0 +1,130 @@ +import os +from celescope.tools import utils +from celescope.tools.Step import Step, s_common +from celescope.tracer_vdj.split_fastq import get_barcodes +from celescope.tools.barcode import * +import pysam +import pandas as pd + + +TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/run-trust4' + + +def count_fq(fq1): + bcs, umis, names = [], [], [] + count_df = pd.DataFrame() + with pysam.FastxFile(fq1) as fq: + for entry in fq: + attr = entry.sequence + cb = attr[:24] + umi = attr[24:] + name = entry.name + bcs.append(cb) + umis.append(umi) + names.append(name) + count_df['barcode'] = bcs + count_df['UMI'] = umis + count_df['seq_name'] = names + + return count_df + +@utils.add_log +def match_barcodes(outdir, match_dir, Seqtype, fq1): + annotated_bcs = get_barcodes(match_dir, Seqtype) + bcs_df = pd.DataFrame(annotated_bcs, columns=['barcode']) + count_df = count_fq(fq1) + df = pd.merge(bcs_df, count_df, on='barcode', how='inner') + seqnames = df['seq_name'].tolist() + seqlist = open(f'{outdir}/seqlist.txt', 'w') + for name in seqnames: + seqlist.write(str(name) + '\n') + + count_df.to_csv(f'{outdir}/count.txt', sep='\t') + df.to_csv(f'{outdir}/matched_count.txt', sep='\t') + + +class Trust_assemble(Step): + """ + Features + + - Get fq file + """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.match_dir = args.match_dir + self.Seqtype = args.Seqtype + self.fq1 = args.fq1 + self.fq2 = args.fq2 + self.sample = args.sample + self.species = args.species + + + @utils.add_log + def getFqfile(self): + match_barcodes(self.outdir, self.match_dir, self.Seqtype, self.fq1) + + cmd1 = ( + f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R1.fq' + ) + os.system(cmd1) + + cmd2 = ( + f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R2.fq' + ) + os.system(cmd2) + + + @utils.add_log + def run(self): + + self.getFqfile() + + species = self.species + + if species =='Mmus': + index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/GRCm38_bcrtcr.fa' + ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/mouse_IMGT+C.fa' + elif species == 'Hsap': + index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/hg38_bcrtcr.fa' + ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/human_IMGT+C.fa' + cmd = ( + f'{TRUST} -t {self.thread} ' + f'-u {self.outdir}/{self.sample}_R2.fq ' + f'--barcode {self.outdir}/{self.sample}_R1.fq ' + f'--barcodeRange 0 23 + ' + f'-f {index_file} ' + f'--ref {ref} ' + f'-o {self.sample} --od {self.outdir}/TRUST4' + ) + + os.system(cmd) + + os.remove(f'{self.outdir}/seqlist.txt') + + +@utils.add_log +def trust_assemble(args): + step_name = 'trust_assemble' + trust_assemble_obj = Trust_assemble(args, step_name) + trust_assemble_obj.run() + + +def get_opts_trust_assemble(parser, sub_program): + if sub_program: + 
parser = s_common(parser) + parser.add_argument('--fq1', help='R1 reads from barcode step', required=True) + parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) + parser.add_argument('--match_dir', help='match_dir', required=True) + parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + + + + + + + + -- Gitee From a43ab53fc1172a8cc25761c8c672169828e98210 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 11 Jun 2021 10:43:47 +0800 Subject: [PATCH 37/96] repo from zhouyiqi to singleron-RD --- setup.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/setup.py b/setup.py index 963f7d92..64798773 100755 --- a/setup.py +++ b/setup.py @@ -20,10 +20,10 @@ setuptools.setup( version=__VERSION__, author="zhouyiqi", author_email="zhouyiqi@singleronbio.com", - description="GEXSCOPE Single cell analysis", + description="Single Cell Analysis Pipelines", long_description=long_description, long_description_content_type="text/markdown", - url="https://github.com/zhouyiqi91/CeleScope", + url="https://github.com/singleron-RD/CeleScope", packages=setuptools.find_packages(), classifiers=[ "Programming Language :: Python :: 3", -- Gitee From 32ac9d044e15b7b1f12fc1f2629fe19105faa256 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 11 Jun 2021 11:04:03 +0800 Subject: [PATCH 38/96] lint --- celescope/vdj/count_vdj.py | 70 ++++++++++++++++++++++---------------- 1 file changed, 40 insertions(+), 30 deletions(-) diff --git a/celescope/vdj/count_vdj.py b/celescope/vdj/count_vdj.py index 04b2c32e..414a5bf4 100755 --- a/celescope/vdj/count_vdj.py +++ b/celescope/vdj/count_vdj.py @@ -43,7 +43,8 @@ class Count_vdj(Step): if (not args.match_dir) or (args.match_dir == "None"): self.match_bool = False if self.match_bool: - self.match_cell_barcodes, _match_cell_number = utils.read_barcode_file(args.match_dir) + self.match_cell_barcodes, _match_cell_number = utils.read_barcode_file( + args.match_dir) # out files self.cell_confident_file = f"{self.out_prefix}_cell_confident.tsv" @@ -59,7 +60,8 @@ class Count_vdj(Step): df_UMI_sum = df_UMI_count_filter.groupby( ['barcode'], as_index=False).agg({"UMI": "sum"}) if (self.args.UMI_min == "auto"): - df_UMI_sum_sorted = df_UMI_sum.sort_values(["UMI"], ascending=False) + df_UMI_sum_sorted = df_UMI_sum.sort_values( + ["UMI"], ascending=False) rank_UMI = df_UMI_sum_sorted.iloc[CELL_CALLING_RANK, :]["UMI"] UMI_min = int(rank_UMI / 10) else: @@ -71,7 +73,7 @@ class Count_vdj(Step): df = df_UMI_sum.sort_values('UMI', ascending=False) self.add_data_item(CB_num=df[df['mark'] == 'CB'].shape[0]) self.add_data_item(Cells=list(df.loc[df['mark'] == 'CB', 'UMI'])) - self.add_data_item(UB_num= df[df['mark'] == 'UB'].shape[0]) + self.add_data_item(UB_num=df[df['mark'] == 'UB'].shape[0]) self.add_data_item(Background=list(df.loc[df['mark'] == 'UB', 'UMI'])) cell_barcodes = set(df_UMI_cell.barcode) @@ -81,7 +83,8 @@ class Count_vdj(Step): value=total_cell_number, ) - df_cell = df_UMI_count_filter[df_UMI_count_filter.barcode.isin(cell_barcodes)] + df_cell = df_UMI_count_filter[df_UMI_count_filter.barcode.isin( + cell_barcodes)] return df_cell, cell_barcodes @utils.add_log @@ -97,7 +100,7 @@ class Count_vdj(Step): ["barcode", "chain"], as_index=False).head(1) return df_confident - def get_df_valid_count(self,df_confident): + def get_df_valid_count(self, df_confident): df_valid_count = df_confident.set_index(["barcode", 
"chain"]) df_valid_count = df_valid_count.unstack() df_valid_count.columns = ['_'.join(col) for col in df_valid_count] @@ -105,14 +108,13 @@ class Count_vdj(Step): df_valid_count.fillna(inplace=True, value="NA") return df_valid_count - def get_clonetypes_and_write(self, df_valid_count, cell_barcodes): """ Returns - df_clonetypes - df_match_clonetypes """ - + total_cell_number = len(cell_barcodes) df_clonetypes = df_valid_count.copy() df_match_clonetypes = None @@ -121,7 +123,8 @@ class Count_vdj(Step): "barcode": "count"}) # put na last df_clonetypes.replace('NA', np.nan, inplace=True) - df_clonetypes.sort_values(["barcode"] + self.cols, ascending=False, na_position='last', inplace=True) + df_clonetypes.sort_values( + ["barcode"] + self.cols, ascending=False, na_position='last', inplace=True) df_clonetypes.replace(np.nan, 'NA', inplace=True) total_CDR3_barcode_number = sum(df_clonetypes.barcode) @@ -138,7 +141,8 @@ class Count_vdj(Step): # order order = ["clonetype_ID"] + self.cols + ["barcode", "percent"] df_clonetypes = df_clonetypes[order] - df_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True) + df_clonetypes.rename( + columns={"barcode": "barcode_count"}, inplace=True) # out clonetypes df_clonetypes.to_csv(self.clonetypes_file, sep="\t", index=False) @@ -194,11 +198,11 @@ class Count_vdj(Step): total=total_cell_number ) - # BCR elif self.args.type == "BCR": - UMI_col_dic = {"IGH": "UMI_IGH", "IGL": "UMI_IGL", "IGK": "UMI_IGK"} + UMI_col_dic = {"IGH": "UMI_IGH", + "IGL": "UMI_IGL", "IGK": "UMI_IGK"} for chain in UMI_col_dic: UMI_col_name = UMI_col_dic[chain] if UMI_col_name in df_valid_count.columns: @@ -271,24 +275,26 @@ class Count_vdj(Step): df_match_clonetypes["percent"] = df_match_clonetypes["percent"].apply( lambda x: round(x, 2) ) - df_match_clonetypes.rename(columns={"barcode": "barcode_count"}, inplace=True) + df_match_clonetypes.rename( + columns={"barcode": "barcode_count"}, inplace=True) df_match_clonetypes = df_match_clonetypes.merge( df_clonetypes, on=self.cols, how='left', suffixes=('', '_y')) # order and drop duplicated cols order = ["clonetype_ID"] + self.cols + ["barcode_count", "percent"] df_match_clonetypes = df_match_clonetypes[order] - df_match_clonetypes.sort_values(["barcode_count", "clonetype_ID"], ascending=[False,True], inplace=True) + df_match_clonetypes.sort_values(["barcode_count", "clonetype_ID"], ascending=[ + False, True], inplace=True) df_match_clonetypes.to_csv( self.match_clonetypes_file, sep="\t", index=False) return df_clonetypes, df_match_clonetypes - def write_cell_confident_count(self, df_valid_count, df_clonetypes, df_confident): df_mergeID = pd.merge(df_valid_count, - df_clonetypes, how="left", on=self.cols) + df_clonetypes, how="left", on=self.cols) df_mergeID.sort_values(["clonetype_ID", "barcode"], inplace=True) # output df_valid_count - df_mergeID.to_csv(self.cell_confident_count_file, sep="\t", index=False) + df_mergeID.to_csv(self.cell_confident_count_file, + sep="\t", index=False) df_mergeID = df_mergeID[["barcode", "clonetype_ID"]] df_cell_confident_with_ID = pd.merge( df_confident, df_mergeID, how="left", on="barcode") @@ -298,9 +304,8 @@ class Count_vdj(Step): df_cell_confident_with_ID.to_csv( self.cell_confident_file, sep="\t", index=False) - def write_clonetypes_table_to_data(self, df_clonetypes, df_match_clonetypes): - # cloneytpes table + # cloneytpes table def format_table(df_clonetypes): df_table = df_clonetypes.copy() df_table["percent"] = df_table["percent"].apply( @@ -310,7 +315,8 @@ class Count_vdj(Step): 
for chain in self.chains: for seq in seqs: cols.append("_".join([seq, chain])) - df_table_cols = ["clonetype_ID"] + cols + ["barcode_count", "percent"] + df_table_cols = ["clonetype_ID"] + \ + cols + ["barcode_count", "percent"] df_table = df_table[df_table_cols] table_header = ["Clonetype_ID"] + cols + ["Frequency", "Percent"] return df_table, table_header @@ -325,18 +331,21 @@ class Count_vdj(Step): self.add_data_item(table_dict=table_dict) def run(self): - df_UMI_count_filter = pd.read_csv(self.args.UMI_count_filter_file, sep='\t') + df_UMI_count_filter = pd.read_csv( + self.args.UMI_count_filter_file, sep='\t') df_cell, cell_barcodes = self.cell_calling(df_UMI_count_filter) df_confident = self.get_df_confident(df_cell) df_valid_count = self.get_df_valid_count(df_confident) - df_clonetypes, df_match_clonetypes = self.get_clonetypes_and_write(df_valid_count, cell_barcodes) - self.write_cell_confident_count(df_valid_count, df_clonetypes, df_confident) + df_clonetypes, df_match_clonetypes = self.get_clonetypes_and_write( + df_valid_count, cell_barcodes) + self.write_cell_confident_count( + df_valid_count, df_clonetypes, df_confident) self.write_clonetypes_table_to_data(df_clonetypes, df_match_clonetypes) self.clean_up() def count_vdj(args): - # TODO + # TODO # add TCR or BCR prefix to distinguish them in html report summary; should improve step_name = f"{args.type}_count_vdj" count_vdj_obj = Count_vdj(args, step_name) @@ -344,25 +353,26 @@ def count_vdj(args): def get_opts_count_vdj(parser, sub_program): - parser.add_argument("--type", help="Required. `TCR` or `BCR`. ", required=True) + parser.add_argument( + "--type", help="Required. `TCR` or `BCR`. ", required=True) parser.add_argument( '--UMI_min', - help='Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell.', + help='Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell.', default="auto" ) parser.add_argument( - '--iUMI', + '--iUMI', help="""Default `1`. Minimum number of UMI of identical receptor type and CDR3. -For each (barcode, chain) combination, only UMI>=iUMI is considered valid.""", +For each (barcode, chain) combination, only UMI>=iUMI is considered valid.""", type=int, default=1 ) if sub_program: - parser.add_argument("--UMI_count_filter_file", help="Required. File from step mapping_vdj.", required=True) + parser.add_argument("--UMI_count_filter_file", + help="Required. File from step mapping_vdj.", required=True) parser.add_argument( - "--match_dir", + "--match_dir", help="Match celescope scRNA-Seq directory. 
", default=None ) parser = s_common(parser) - -- Gitee From 7ee397d4cb6e0e163dba38f4a6f6b5b572e5b84f Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 11 Jun 2021 14:38:46 +0800 Subject: [PATCH 39/96] rm redundant codes --- celescope/rna/star.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/celescope/rna/star.py b/celescope/rna/star.py index 2af181e2..3e9e1f4d 100755 --- a/celescope/rna/star.py +++ b/celescope/rna/star.py @@ -44,9 +44,9 @@ class Star_rna(Step, StarMixin): # parse self.refflat = f"{self.genomeDir}/{self.genome['refflat']}" - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' - self.picard_region_log = f'{self.outdir}/{self.sample}_region.log' + self.ribo_log = f'{self.out_prefix}_ribo_log.txt' + self.ribo_run_log = f'{self.out_prefix}_ribo_run.log' + self.picard_region_log = f'{self.out_prefix}_region.log' self.plot = None self.stats = pd.Series() @@ -113,9 +113,8 @@ class Star_rna(Step, StarMixin): @utils.add_log def ribo(self): + # TODO remove bbduk.sh and use picard ribo bases human_ribo_fa = f'{ROOT_PATH}/data/rRNA/human_ribo.fasta' - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' cmd = ( f'bbduk.sh ' f'in1={self.fq} ' -- Gitee From 84224f8510bb45f7606660358026d7d1aaff5343 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 11 Jun 2021 19:41:52 +0800 Subject: [PATCH 40/96] fix bug if one chain miss in all cells --- .../html/tracer_vdj/go_assemble_summary.html | 14 +- .../html/tracer_vdj/vdj_sum_summary.html | 14 +- celescope/tracer_vdj/go_assemble.py | 8 +- celescope/tracer_vdj/vdj_sum.py | 376 ++++++++---------- celescope/trust_vdj/res_filter.py | 2 +- 5 files changed, 179 insertions(+), 235 deletions(-) diff --git a/celescope/templates/html/tracer_vdj/go_assemble_summary.html b/celescope/templates/html/tracer_vdj/go_assemble_summary.html index 5bb8c0bf..be42468a 100644 --- a/celescope/templates/html/tracer_vdj/go_assemble_summary.html +++ b/celescope/templates/html/tracer_vdj/go_assemble_summary.html @@ -3,14 +3,14 @@
{% for item in go_assemble_summary %} diff --git a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html index 7b5fdb08..fc2e6d7c 100644 --- a/celescope/templates/html/tracer_vdj/vdj_sum_summary.html +++ b/celescope/templates/html/tracer_vdj/vdj_sum_summary.html @@ -9,18 +9,18 @@

Cells with IGL: Cells with full length IGL.
Cells with paired IGH and IGK: Cells with paired IGH and IGK.
Cells with paired IGH and IGL: Cells with paired IGH and IGL.
-Median UMIs per cell: Median total UMIs per cell.
-Median IGH UMIs per cell: Median UMIs mapped to IGH.
-Median IGK UMIs per cell: Median UMIs mapped to IGK.
-Median IGL UMIs per cell: Median UMIs mapped to IGL.
+Median read count per cell: Median total read count per cell.
+Median IGH read count per cell: Median read count mapped to IGH.
+Median IGK read count per cell: Median read count mapped to IGK.
+Median IGL read count per cell: Median read count mapped to IGL.
If type is TCR:
Estimated Number of Cells: Number of cells which contain full length TRA or TRB.
Cells with TRA: Cells with full length TRA.
Cells with TRB: Cells with full length TRB.
Cells with paired TRA and TRB: Cells with paired TRA and TRB.
-Median UMIs per cell: Median UMIs mapped to TRA and TRB.
-Median TRA UMIs per cell: Median UMIs mapped to TRA.
-Median TRB UMIs per cell: Median UMIs mapped to TRB.
+Median read count per cell: Median read count mapped to TRA and TRB.
+Median TRA read count per cell: Median read count mapped to TRA.
+Median TRB read count per cell: Median read count mapped to TRB.
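The medians above are computed in vdj_sum.py from TraCeR/BraCeR's per-cell aligned-read fastq files (under 04.go_assemble, per the comment in tpm_count). A minimal sketch of both metrics (read count per cell and distinct-UMI count per cell), assuming the {barcode}_{UMI}_{index} read-name convention written by the barcode step; the helper name below is hypothetical:

import glob
import numpy as np
import pysam

def count_reads_and_umis(fq):
    # Read names follow {barcode}_{UMI}_{index}, so the UMI is the
    # second underscore-separated field of each record name.
    n_reads, umis = 0, set()
    with pysam.FastxFile(fq) as fh:
        for entry in fh:
            n_reads += 1
            umis.add(entry.name.split('_')[1])
    return n_reads, len(umis)

# Median per-cell TRB metrics, using the same glob pattern as vdj_sum.py.
fqs = glob.glob('04.go_assemble/tracer/*/aligned_reads/*_TCR_B.fastq')
pairs = [count_reads_and_umis(fq) for fq in fqs]
median_reads = int(np.median([p[0] for p in pairs])) if pairs else 0
median_umis = int(np.median([p[1] for p in pairs])) if pairs else 0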

{% for item in vdj_sum_summary %} diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 452eb54d..3a291d39 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -227,9 +227,13 @@ class Go_assemble(Step): @utils.add_log def run(self): if self.Seqtype == 'TCR': - self.run_tracer() + tracer_dir = f'{self.outdir}/tracer/filtered_TCRAB_summary/recombinants.txt' + if not os.path.exists(tracer_dir): + self.run_tracer() elif self.Seqtype == 'BCR': - self.run_bracer() + bracer_dir = f'{self.outdir}/bracer/filtered_BCR_summary/changeodb.tab' + if not os.path.exists(bracer_dir): + self.run_bracer() self.clean_up() diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index d3896db0..73772bc4 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -1,4 +1,5 @@ import os +from re import I import pandas as pd from Bio.Seq import Seq import numpy as np @@ -9,17 +10,16 @@ import glob import pysam -def get_umi_count(fq): - umis = [] +def get_read_count(fq): + count = 0 with pysam.FastxFile(fq) as fh: for entry in fh: - attr = entry.name.split('_') - umi = attr[1] - umis.append(umi) - res = len(set(umis)) - return res + count += 1 + return count + +@utils.add_log def tpm_count(ass_dir): rec = pd.read_csv(f'{ass_dir}/tracer/filtered_TCRAB_summary/recombinants.txt', sep='\t') # ass_dir outdir/sample/04.go_assemble @@ -41,54 +41,79 @@ def tpm_count(ass_dir): return productive +@utils.add_log def filtering(Seqtype, ass_dir, outdir): if not os.path.exists(outdir): os.makedirs(outdir) if Seqtype == 'TCR': data = tpm_count(ass_dir) - cell_name = set(list(data['cell_name'])) + cell_name = list(set(list(data['cell_name']))).sort() filtered = pd.DataFrame() - for name in cell_name: - count_data = data[data['cell_name'] == name] - tra = count_data[count_data['locus'] == 'A'] - trb = count_data[count_data['locus'] == 'B'] - if tra.empty is not True: - tra = tra.sort_values(by='TPM', ascending=False) - tra = tra.head(1) - filtered = filtered.append(tra, ignore_index=True) - if trb.empty is not True: - trb = trb.sort_values(by='TPM', ascending=False) - trb = trb.head(1) - filtered = filtered.append(trb, ignore_index=True) + df = pd.DataFrame(cell_name, columns=['cell_name']) + loci = ['A', 'B'] + for locus in loci: + tmp = data[data['locus']==locus] + tmp = tmp.sort_values(by='TPM', ascending=False) + tmp = tmp.drop_duplicates('cell_name', 'first') + filtered = filtered.append(tmp, ignore_index=True) + + tmp = tmp.rename(columns={'CDR3aa': f'TR{locus}_CDR3aa'}) + clones = tmp[['cell_name', f'TR{locus}_CDR3aa']] + df = pd.merge(df, clones, on='cell_name', how='outer') + + df = df.fillna('None') + clonetypes = df.groupby(['TRA_CDR3aa', 'TRB_CDR3aa']).agg({'cell_name': 'count'}) + clonetypes = clonetypes.sort_values(by='cell_name', ascending=False) + clonetypes = clonetypes.rename(columns={'cell_name': 'Frequency'}) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') elif Seqtype == 'BCR': data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') - data = data[data['FUNCTIONAL'] == True] - cell_name = set(list(data['CELL'])) + data = data[(data['FUNCTIONAL'] == True) & (data['IN_FRAME'] == True)] + cell_name = list(set(data['CELL'].tolist())).sort() filtered = pd.DataFrame() - for name in cell_name: - count_cell = data[data['CELL'] == name] - count_h = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'H']) - 
count_k = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'K']) - count_l = pd.DataFrame(count_cell[count_cell['LOCUS'] == 'L']) - count_k_l = count_k.append(count_l) - if count_h.empty is not True: - count_h = count_h.sort_values(by='TPM', ascending=False) - count_h = count_h.head(1) - filtered = filtered.append(count_h, ignore_index=True) - if count_k_l.empty is not True: - count_k_l = count_k_l.sort_values(by='TPM', ascending=False) - count_k_l = count_k_l.head(1) - filtered = filtered.append(count_k_l, ignore_index=True) + tmp = data[data['LOCUS'] == 'H'] + tmp = tmp.sort_values(by='TPM', ascending=False) + tmp = tmp.drop_duplicates('CELL', 'first') + filtered = filtered.append(tmp, ignore_index=True) + + tmp2 = data[data['LOCUS'] != 'H'] + tmp2 = tmp2.sort_values(by='TPM', ascending=False) + tmp2 = tmp2.drop_duplicates('CELL', 'first') + filtered = filtered.append(tmp2, ignore_index=True) + + df = pd.DataFrame(cell_name, columns=['CELL']) + + loci = ['H', 'L', 'K'] + for locus in loci: + tmp = filtered[filtered['LOCUS'] == locus][['CELL', 'JUNCTION']] + tmp.columns = ['CELL', f'JUNCTION_{locus}'] + ntseqs = tmp[f'JUNCTION_{locus}'].tolist() + tmplist = [] + for nt in ntseqs: + nt = Seq(nt) + nt = nt.reverse_complement() + tmplist.append(str(nt)) + tmp.insert(tmp.shape[1], f'IG{locus}_CDR3aa', tmplist) + + df = pd.merge(df, tmp, on='CELL', how='outer') + + df = df.fillna('None') + + clonetypes = df.groupby(['IGH_CDR3aa', 'IGL_CDR3aa', 'IGK_CDR3aa']).agg({'CELL': 'count'}) + clonetypes = clonetypes.sort_values(by='CELL', ascending=False) + clonetypes = clonetypes.rename(columns={'CELL': 'Frequency'}) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + filtered = filtered.rename(columns={'CELL': 'cell_name'}) filtered.to_csv(f'{outdir}/filtered.txt', sep='\t') - return filtered - class Vdj_sum(Step): """ @@ -116,62 +141,32 @@ class Vdj_sum(Step): fastq_dir = self.fastq_dir Seqtype = self.Seqtype - results = filtering(Seqtype, ass_dir, outdir) + filtering(Seqtype, ass_dir, outdir) + + filter_data = pd.read_csv(f'{outdir}/filtered.txt', sep='\t') stat_file = outdir + '/stat.txt' vdj_sum_summary = [] count_umi_file = f'{fastq_dir}/../{self.sample}_count.txt' - count_umi = pd.read_csv(count_umi_file, sep='\t', index_col=0) - median_all = int(count_umi['UMI'].median()) - if Seqtype == 'TCR': + clonetypes = pd.read_csv(f'{outdir}/clonetypes.tsv', sep='\t') - productive_cells = set(results['cell_name'].tolist()) + productive_cells = set(filter_data['cell_name'].tolist()) + productive_cells_num = len(productive_cells) + if Seqtype == 'TCR': + # barcode umi plot count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") count_umi.to_csv(count_umi_file, sep='\t') self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - productive_cells_num = len(productive_cells) - - TRA_chain = results[results['locus'] == 'A'] - TRA_chain_num = TRA_chain.shape[0] - TRB_chain = results[results['locus'] == 'B'] - TRB_chain_num = TRB_chain.shape[0] - - TRAs, TRBs = [], [] - paired_cell = 0 - for cell in productive_cells: - tmp1 = TRA_chain[TRA_chain['cell_name'] == cell] - if tmp1.empty is not True: - chainA = tmp1['CDR3aa'].tolist()[0] - TRAs.append(chainA) - else: - TRAs.append('NaN') - - tmp2 = TRB_chain[TRB_chain['cell_name'] == cell] - if tmp2.empty is not True: - chainB = tmp2['CDR3aa'].tolist()[0] - TRBs.append(chainB) - else: - TRBs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_cell += 1 - - clonetypes_table = 
pd.DataFrame() - clonetypes_table['TRA_chain'] = TRAs - clonetypes_table['TRB_chain'] = TRBs - clonetypes_table['Frequency'] = '' - - clonetypes = clonetypes_table.groupby(['TRA_chain', 'TRB_chain']).agg({'Frequency': 'count'}) - + # clonetype table sum_c = clonetypes['Frequency'].sum() proportions = [] for f in list(clonetypes['Frequency']): @@ -184,10 +179,13 @@ class Vdj_sum(Step): clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'TRA_chain', 'TRB_chain', 'Frequency', 'Proportion'])) + clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes['TRA_CDR3aa'] = clonetypes.TRA_CDR3aa.apply(lambda x: 'C'+str(x)+'F' if x != 'None' else 'None') + clonetypes['TRB_CDR3aa'] = clonetypes.TRB_CDR3aa.apply(lambda x: 'C'+str(x)+'F' if x != 'None' else 'None') - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + clonetypes = clonetypes.reindex(columns=list(['CloneId', 'TRA_CDR3aa', 'TRB_CDR3aa', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t', index=None) vdj_sum_summary.append({ 'item': 'Estimated Number of Cells', @@ -195,17 +193,18 @@ class Vdj_sum(Step): 'total_count': np.nan, }) - vdj_sum_summary.append({ - 'item': 'Cells with TRA', - 'count': TRA_chain_num, - 'total_count': productive_cells_num, - }) + loci = ['A', 'B'] - vdj_sum_summary.append({ - 'item': 'Cells with TRB', - 'count': TRB_chain_num, - 'total_count': productive_cells_num, - }) + for locus in loci: + tmp = int(clonetypes[clonetypes[f'TR{locus}_CDR3aa'] != 'None']['Frequency'].sum()) + + vdj_sum_summary.append({ + 'item': f'Cells with TR{locus}', + 'count': tmp, + 'total_count': productive_cells_num, + }) + + paired_cell = int(clonetypes[(clonetypes['TRA_CDR3aa'] != 'None') & (clonetypes['TRB_CDR3aa'] != 'None')]['Frequency'].sum()) vdj_sum_summary.append({ 'item': 'Cells with paired TRA and TRB', @@ -213,97 +212,47 @@ class Vdj_sum(Step): 'total_count': productive_cells_num, }) - TRAs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_A.fastq') - TRBs = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_B.fastq') - TRA_UMIs = [get_umi_count(fq) for fq in TRAs] - TRB_UMIs = [get_umi_count(fq) for fq in TRBs] - - medianA = int(np.median(TRA_UMIs)) - medianB = int(np.median(TRB_UMIs)) - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', + 'item': 'Median read count per cell', 'count': median_all, 'total_count': np.nan - }) - - vdj_sum_summary.append({ - 'item': 'Median TRA UMIs per cell', - 'count': medianA, - 'total_count': np.nan - }) + }) - vdj_sum_summary.append({ - 'item': 'Median TRB UMIs per cell', - 'count': medianB, - 'total_count': np.nan - }) + for locus in loci: + tmp = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_{locus}.fastq') + if len(tmp) != 0: + read_count = [get_read_count(fq) for fq in tmp] + read_count.sort() + for i in range(len(read_count)): + if read_count[i] != 0: + idx = i + break + read_count = read_count[idx:] + median_tmp = int(np.median(read_count)) + vdj_sum_summary.append({ + 'item': f'Median TR{locus} read count per cell', + 'count': median_tmp, + 'total_count': np.nan + }) + else: + vdj_sum_summary.append({ + 'item': f'Median TR{locus} read count per cell', + 'count': 0, + 'total_count': np.nan + }) elif Seqtype == 'BCR': - productive_cells = set(results['CELL'].tolist()) - - productive_cells_num = 
len(productive_cells) - + # barcode umi plot count_umi['mark'] = count_umi['cell_name'].apply(lambda x: "CB" if (x in productive_cells) else "UB") count_umi.to_csv(count_umi_file, sep='\t') self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_umi_file)) - results_h = results[results['LOCUS'] == 'H'] - results_k = results[results['LOCUS'] == 'K'] - results_l = results[results['LOCUS'] == 'L'] - results_h_count = results_h.shape[0] - results_k_count = results_k.shape[0] - results_l_count = results_l.shape[0] - - IGHs, IGKs, IGLs = [], [], [] - - paired_k, paired_l = 0, 0 - - for cell in productive_cells: - tmp1 = results_h[results_h['CELL'] == cell] - if tmp1.empty is not True: - seq = tmp1['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGHs.append(aaseq) - else: - IGHs.append('NaN') - - tmp2 = results_l[results_l['CELL'] == cell] - if tmp2.empty is not True: - seq = tmp2['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGLs.append(aaseq) - else: - IGLs.append('NaN') - - tmp3 = results_k[results_k['CELL'] == cell] - if tmp3.empty is not True: - seq = tmp3['JUNCTION'].tolist()[0] - seq = Seq(seq) - aaseq = seq.translate() - IGKs.append(aaseq) - else: - IGKs.append('NaN') - - if not tmp1.empty and not tmp2.empty: - paired_l += 1 - if not tmp1.empty and not tmp3.empty: - paired_k += 1 - - clonetypes_table = pd.DataFrame() - - clonetypes_table['IGH_chain'] = IGHs - clonetypes_table['IGL_chain'] = IGLs - clonetypes_table['IGK_chain'] = IGKs - clonetypes_table['Frequency'] = '' - clonetypes = clonetypes_table.groupby(['IGH_chain', 'IGL_chain', 'IGK_chain']).agg({'Frequency': 'count'}) + # clone type table Proportion = [] sum_c = clonetypes['Frequency'].sum() @@ -317,9 +266,10 @@ class Vdj_sum(Step): clonetypes = clonetypes.sort_values(by='Frequency', ascending=False) clonetypes = clonetypes.reset_index() - clonetypes['clonetypeId'] = [i for i in range(1, (clonetypes.shape[0]+1))] - clonetypes = clonetypes.reindex(columns=list(['clonetypeId', 'IGH_chain', 'IGL_chain', 'IGK_chain', 'Frequency', 'Proportion'])) - clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t') + clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['CloneId', 'IGH_CDR3aa', 'IGL_CDR3aa', 'IGK_CDR3aa', 'Frequency', 'Proportion'])) + + clonetypes.to_csv(f'{outdir}/clonetypes.tsv', sep='\t', index=None) vdj_sum_summary.append({ @@ -328,78 +278,68 @@ class Vdj_sum(Step): 'total_count': np.nan }) - vdj_sum_summary.append({ - 'item': 'Cells with IGH', - 'count': results_h_count, - 'total_count': productive_cells_num - }) + loci = ['H', 'L', 'K'] - vdj_sum_summary.append({ - 'item': 'Cells with IGK', - 'count': results_k_count, - 'total_count': productive_cells_num - }) + for locus in loci: + tmp = int(clonetypes[clonetypes[f'IG{locus}_CDR3aa']!='None']['Frequency'].sum()) - vdj_sum_summary.append({ - 'item': 'Cells with IGL', - 'count': results_l_count, - 'total_count': productive_cells_num - }) + vdj_sum_summary.append({ + 'item': f'Cells with IG{locus}', + 'count': tmp, + 'total_count': productive_cells_num + }) - vdj_sum_summary.append({ - 'item': 'Cells with paired IGH and IGK', - 'count': paired_k, - 'total_count': productive_cells_num - }) + paired_H_L = int(clonetypes[(clonetypes['IGH_CDR3aa']!='None') & (clonetypes['IGL_CDR3aa']!='None')]['Frequency'].sum()) vdj_sum_summary.append({ 'item': 'Cells with paired IGH and IGL', - 'count': paired_l, + 'count': paired_H_L, 'total_count': productive_cells_num }) - 
IGHs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_H.fastq') - IGKs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_K.fastq') - IGLs = glob.glob(f'{outdir}/bracer/*/aligned_reads/*_BCR_L.fastq') - - IGH_UMIs = [get_umi_count(fq) for fq in IGHs] - IGK_UMIs = [get_umi_count(fq) for fq in IGKs] - IGL_UMIs = [get_umi_count(fq) for fq in IGLs] - - medianH = int(np.median(IGH_UMIs)) - medianL = int(np.median(IGL_UMIs)) - medianK = int(np.median(IGK_UMIs)) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': median_all, - 'total_count': np.nan - }) + paired_H_K = int(clonetypes[(clonetypes['IGH_CDR3aa']!='None') & (clonetypes['IGK_CDR3aa']!='None')]['Frequency'].sum()) vdj_sum_summary.append({ - 'item': 'Median IGH UMIs per cell', - 'count': medianH, - 'total_count': np.nan + 'item': 'Cells with paired IGH and IGK', + 'count': paired_H_K, + 'total_count': productive_cells_num }) vdj_sum_summary.append({ - 'item': 'Median IGL UMIs per cell', - 'count': medianL, + 'item': 'Median read count per cell', + 'count': median_all, 'total_count': np.nan }) - vdj_sum_summary.append({ - 'item': 'Median IGK UMIs per cell', - 'count': medianK, - 'total_count': np.nan - }) + for locus in loci: + tmp = glob.glob(f'{ass_dir}/bracer/*/aligned_reads/*_BCR_{locus}.fastq') + if len(tmp) != 0: + read_count = [get_read_count(fq) for fq in tmp] + read_count.sort() + for i in range(len(read_count)): + if read_count[i] != 0: + idx = i + break + read_count = read_count[idx:] + median_tmp = int(np.median(read_count)) + vdj_sum_summary.append({ + 'item': f'Median IG{locus} read count per cell', + 'count': median_tmp, + 'total_count': np.nan + }) + else: + vdj_sum_summary.append({ + 'item': f'Median IG{locus} read count per cell', + 'count': 0, + 'total_count': np.nan + }) df = pd.DataFrame(vdj_sum_summary, columns=['item', 'count', 'total_count']) utils.gen_stat(df, stat_file) - # clonetype table + # clonetype table title = 'Clonetypes' table_dict = self.get_table(title, 'clonetypes_table', clonetypes) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index ce46c374..ce6b1e2d 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -63,7 +63,7 @@ class Res_filter(Step): @utils.add_log def run(self): - barcode_report = f'{self.outdir}/../02.truse_assemble/TRUST4/{self.sample}_barcode_report.tsv' + barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' res = beauty_res(self.outdir, barcode_report) filtered = res[(res['TRB_fl']!='0')&(res['TRA_fl']!='0')] fre = [''] * filtered.shape[0] -- Gitee From f494a4b8cad5f22924fc7cc023aacd1bbc302b43 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 15 Jun 2021 14:10:42 +0800 Subject: [PATCH 41/96] change read count to umi count --- celescope/tracer_vdj/vdj_sum.py | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 73772bc4..01a25116 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -10,12 +10,17 @@ import glob import pysam -def get_read_count(fq): - count = 0 + +def get_umi_count(fq): + umis = [] with pysam.FastxFile(fq) as fh: for entry in fh: - count += 1 - + name = entry.name + name = name.split('_') + umi = name[1] + umis.append(umi) + count = len(set(umis)) + return count @@ -48,7 +53,7 @@ def filtering(Seqtype, ass_dir, outdir): if Seqtype == 'TCR': data = tpm_count(ass_dir) - cell_name = 
list(set(list(data['cell_name']))).sort() + cell_name = sorted(list(set(list(data['cell_name'])))) filtered = pd.DataFrame() df = pd.DataFrame(cell_name, columns=['cell_name']) loci = ['A', 'B'] @@ -75,7 +80,7 @@ def filtering(Seqtype, ass_dir, outdir): data = pd.read_csv(f'{ass_dir}/bracer/filtered_BCR_summary/changeodb.tab', sep='\t') data = data[(data['FUNCTIONAL'] == True) & (data['IN_FRAME'] == True)] - cell_name = list(set(data['CELL'].tolist())).sort() + cell_name = sorted(list(set(data['CELL'].tolist()))) filtered = pd.DataFrame() tmp = data[data['LOCUS'] == 'H'] @@ -213,7 +218,7 @@ class Vdj_sum(Step): }) vdj_sum_summary.append({ - 'item': 'Median read count per cell', + 'item': 'Median UMIs per cell', 'count': median_all, 'total_count': np.nan }) @@ -221,7 +226,7 @@ class Vdj_sum(Step): for locus in loci: tmp = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_{locus}.fastq') if len(tmp) != 0: - read_count = [get_read_count(fq) for fq in tmp] + read_count = [get_umi_count(fq) for fq in tmp] read_count.sort() for i in range(len(read_count)): if read_count[i] != 0: @@ -230,13 +235,13 @@ class Vdj_sum(Step): read_count = read_count[idx:] median_tmp = int(np.median(read_count)) vdj_sum_summary.append({ - 'item': f'Median TR{locus} read count per cell', + 'item': f'Median TR{locus} UMIs per cell', 'count': median_tmp, 'total_count': np.nan }) else: vdj_sum_summary.append({ - 'item': f'Median TR{locus} read count per cell', + 'item': f'Median TR{locus} UMIs per cell', 'count': 0, 'total_count': np.nan }) @@ -306,7 +311,7 @@ class Vdj_sum(Step): }) vdj_sum_summary.append({ - 'item': 'Median read count per cell', + 'item': 'Median UMIs per cell', 'count': median_all, 'total_count': np.nan }) @@ -314,7 +319,7 @@ class Vdj_sum(Step): for locus in loci: tmp = glob.glob(f'{ass_dir}/bracer/*/aligned_reads/*_BCR_{locus}.fastq') if len(tmp) != 0: - read_count = [get_read_count(fq) for fq in tmp] + read_count = [get_umi_count(fq) for fq in tmp] read_count.sort() for i in range(len(read_count)): if read_count[i] != 0: @@ -323,13 +328,13 @@ class Vdj_sum(Step): read_count = read_count[idx:] median_tmp = int(np.median(read_count)) vdj_sum_summary.append({ - 'item': f'Median IG{locus} read count per cell', + 'item': f'Median IG{locus} UMIs per cell', 'count': median_tmp, 'total_count': np.nan }) else: vdj_sum_summary.append({ - 'item': f'Median IG{locus} read count per cell', + 'item': f'Median IG{locus} UMIs per cell', 'count': 0, 'total_count': np.nan }) -- Gitee From 6ffd3fe560e227996688e91ac6b21ab0fcbd8928 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 15 Jun 2021 14:10:57 +0800 Subject: [PATCH 42/96] rm paired fq --- celescope/tools/barcode.py | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index 1c48f6e3..143ce6d1 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -225,17 +225,12 @@ class Barcode(Step): self.lowNum = args.lowNum self.lowQual = args.lowQual self.allowNoPolyT = args.allowNoPolyT - self.allowNoLinker = args.allowNoLinker - self.paired_fq = args.paired_fq - self.new_f1 = f'{self.outdir}/{self.sample}_new_R1.fq{suffix}' - self.new_f2 = f'{self.outdir}/{self.sample}_new_R2.fq{suffix}' + self.allowNoLinker = args.allowNoLinker @utils.add_log def run(self): fh3 = xopen(self.out_fq2, 'w') - new_f1 = xopen(self.new_f1, 'w') - new_f2 = xopen(self.new_f2, 'w') if self.nopolyT: fh1_without_polyT = xopen(self.outdir + '/noPolyT_1.fq', 'w') @@ -383,11 +378,6 @@ class 
Barcode(Step): fh3.write(f'@{cb}_{umi}_{self.total_num}\n{seq2}\n+\n{qual2}\n') - if self.paired_fq: - - new_f1.write(f'@{header1}\n{cb}{umi}\n+\n{C_U_quals_ascii}\n') - new_f2.write(f'@{header2}\n{seq2}\n+\n{qual2}\n') - Barcode.run.logger.info(self.fq1_list[i] + ' finished.') fh3.close() @@ -498,7 +488,6 @@ def get_opts_barcode(parser, sub_program=True): parser.add_argument('--gzip', help="output gzipped fastq", action='store_true') parser.add_argument( '--chemistry', choices=__PATTERN_DICT__.keys(), help='chemistry version', default='auto') - parser.add_argument('--paired_fq', help="output R1 R2", action='store_true') if sub_program: parser.add_argument('--fq1', help='read1 fq file', required=True) parser.add_argument('--fq2', help='read2 fq file', required=True) -- Gitee From 0c83c8bd0c7d567f1b062744547b48e7cb9804c5 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 15 Jun 2021 14:11:12 +0800 Subject: [PATCH 43/96] add convert step --- celescope/trust_vdj/__init__.py | 2 +- celescope/trust_vdj/convert.py | 336 +++++++++++++++++++++++++ celescope/trust_vdj/multi_trust_vdj.py | 15 +- 3 files changed, 350 insertions(+), 3 deletions(-) create mode 100644 celescope/trust_vdj/convert.py diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py index 69aeb6f7..f95b64a2 100644 --- a/celescope/trust_vdj/__init__.py +++ b/celescope/trust_vdj/__init__.py @@ -1,6 +1,6 @@ __STEPS__ = [ 'sample', - 'barcode', + 'convert', 'trust_assemble', 'res_filter'] __ASSAY__ = 'trust_vdj' diff --git a/celescope/trust_vdj/convert.py b/celescope/trust_vdj/convert.py new file mode 100644 index 00000000..4039a215 --- /dev/null +++ b/celescope/trust_vdj/convert.py @@ -0,0 +1,336 @@ +"""barcode step.""" + +import os +import re +import subprocess +import sys +import glob +from collections import defaultdict, Counter +from itertools import combinations, product + +import pandas as pd +import pysam +from xopen import xopen + +import celescope.tools.utils as utils +from celescope.tools.__init__ import __PATTERN_DICT__ +from celescope.tools.Chemistry import Chemistry +from celescope.tools.barcode import * +from celescope.tools.Step import Step, s_common + + +class Convert(Step): + + '''convert step class + ''' + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.fq1_list = args.fq1.split(",") + self.fq2_list = args.fq2.split(",") + self.fq_number = len(self.fq1_list) + if self.fq_number != len(self.fq2_list): + raise Exception('fastq1 and fastq2 do not have same file number!') + if args.chemistry == 'auto': + ch = Chemistry(args.fq1) + self.chemistry_list = ch.check_chemistry() + else: + self.chemistry_list = [args.chemistry] * self.fq_number + self.barcode_corrected_num = 0 + self.linker_corrected_num = 0 + self.total_num = 0 + self.clean_num = 0 + self.no_polyT_num = 0 + self.lowQual_num = 0 + self.no_linker_num = 0 + self.no_barcode_num = 0 + self.barcode_qual_Counter = Counter() + self.umi_qual_Counter = Counter() + if args.gzip: + suffix = ".gz" + else: + suffix = "" + self.out_fq1 = f'{self.outdir}/{self.sample}_1.fq{suffix}' + self.out_fq2 = f'{self.outdir}/{self.sample}_2.fq{suffix}' + self.nopolyT = args.nopolyT + self.noLinker = args.noLinker + self.bool_probe = False + if args.probe_file and args.probe_file != 'None': + self.bool_probe = True + self.probe_count_dic = utils.genDict(dim=3) + self.valid_count_dic = utils.genDict(dim=2) + self.probe_dic = utils.read_fasta(args.probe_file) + self.reads_without_probe = 0 + self.pattern = args.pattern + 
self.linker = args.linker + self.whitelist = args.whitelist + self.lowNum = args.lowNum + self.lowQual = args.lowQual + self.allowNoPolyT = args.allowNoPolyT + self.allowNoLinker = args.allowNoLinker + + @utils.add_log + def run(self): + + outfq1 = xopen(self.out_fq1, 'w') + outfq2 = xopen(self.out_fq2, 'w') + + if self.nopolyT: + fh1_without_polyT = xopen(self.outdir + '/noPolyT_1.fq', 'w') + fh2_without_polyT = xopen(self.outdir + '/noPolyT_2.fq', 'w') + + if self.noLinker: + fh1_without_linker = xopen(self.outdir + '/noLinker_1.fq', 'w') + fh2_without_linker = xopen(self.outdir + '/noLinker_2.fq', 'w') + + for i in range(self.fq_number): + + chemistry = self.chemistry_list[i] + lowNum = int(self.lowNum) + Convert.run.logger.info(f'lowQual score: {self.lowQual}') + lowQual = int(self.lowQual) + if chemistry == 'scopeV1': + lowNum = min(0, lowNum) + lowQual = max(10, lowQual) + Convert.run.logger.info(f'scopeV1: lowNum={lowNum}, lowQual={lowQual} ') + # get linker and whitelist + bc_pattern = __PATTERN_DICT__[chemistry] + if (bc_pattern): + (linker, whitelist) = get_scope_bc(chemistry) + else: + bc_pattern = self.pattern + linker = self.linker + whitelist = self.whitelist + if not bc_pattern: + raise Exception("invalid bc_pattern!") + + # parse pattern to dict, C8L10C8L10C8U8 + # defaultdict(, {'C': [[0, 8], [18, 26], [36, 44]], 'U': + # [[44, 52]], 'L': [[8, 18], [26, 36]]}) + pattern_dict = parse_pattern(bc_pattern) + + bool_T = True if 'T' in pattern_dict else False + bool_L = True if 'L' in pattern_dict else False + bool_whitelist = (whitelist is not None) and whitelist != "None" + C_len = sum([item[1] - item[0] for item in pattern_dict['C']]) + + if bool_whitelist: + seq_list, _ = utils.read_one_col(whitelist) + barcode_correct_set, barcode_mismatch_dict = get_all_mismatch(seq_list, n_mismatch=1) + barcode_correct_set_list = [barcode_correct_set] * 3 + barcode_mismatch_dict_list = [barcode_mismatch_dict] * 3 + if bool_L: + seq_list, _ = utils.read_one_col(linker) + check_seq(linker, pattern_dict, "L") + linker_correct_set_list = [] + linker_mismatch_dict_list = [] + start = 0 + for item in pattern_dict['L']: + end = start + item[1] - item[0] + linker_seq_list = [seq[start:end] for seq in seq_list] + linker_correct_set, linker_mismatch_dict = get_all_mismatch(linker_seq_list, n_mismatch=2) + linker_correct_set_list.append(linker_correct_set) + linker_mismatch_dict_list.append(linker_mismatch_dict) + start = end + + fq1 = pysam.FastxFile(self.fq1_list[i], persist=False) + fq2 = pysam.FastxFile(self.fq2_list[i], persist=False) + + for entry1 in fq1: + entry2 = next(fq2) + header1, seq1, qual1 = entry1.name, entry1.sequence, entry1.quality + header2, seq2, qual2 = entry2.name, entry2.sequence, entry2.quality + self.total_num += 1 + + # polyT filter + if bool_T and (not self.allowNoPolyT): + polyT = seq_ranges(seq1, pattern_dict['T']) + if polyT.count('T') < MIN_T: + self.no_polyT_num += 1 + if self.nopolyT: + fh1_without_polyT.write( + '@%s\n%s\n+\n%s\n' % (header1, seq1, qual1)) + fh2_without_polyT.write( + '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2)) + continue + + # lowQual filter + C_U_quals_ascii = seq_ranges( + qual1, pattern_dict['C'] + pattern_dict['U']) + # C_U_quals_ord = [ord(q) - 33 for q in C_U_quals_ascii] + if lowQual > 0 and low_qual(C_U_quals_ascii, lowQual, lowNum): + self.lowQual_num += 1 + continue + + # linker filter + if bool_L and (not self.allowNoLinker): + seq_list = get_seq_list(seq1, pattern_dict, 'L') + bool_valid, bool_corrected, _ = check_seq_mismatch( + 
seq_list, linker_correct_set_list, linker_mismatch_dict_list) + if not bool_valid: + self.no_linker_num += 1 + if self.noLinker: + fh1_without_linker.write( + '@%s\n%s\n+\n%s\n' % (header1, seq1, qual1)) + fh2_without_linker.write( + '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2)) + continue + elif bool_corrected: + self.linker_corrected_num += 1 + + # barcode filter + seq_list = get_seq_list(seq1, pattern_dict, 'C') + if bool_whitelist: + bool_valid, bool_corrected, corrected_seq = check_seq_mismatch( + seq_list, barcode_correct_set_list, barcode_mismatch_dict_list) + + if not bool_valid: + self.no_barcode_num += 1 + continue + elif bool_corrected: + self.barcode_corrected_num += 1 + cb = corrected_seq + else: + cb = "".join(seq_list) + + umi = seq_ranges(seq1, pattern_dict['U']) + + self.clean_num += 1 + + if self.bool_probe: + # valid count + read_name_probe = 'None' + self.valid_count_dic[cb][umi] += 1 + + # output probe UMi and read count + find_probe = False + for probe_name in self.probe_dic: + probe_seq = self.probe_dic[probe_name] + probe_seq = probe_seq.upper() + if seq1.find(probe_seq) != -1: + self.probe_count_dic[probe_name][cb][umi] += 1 + read_name_probe = probe_name + find_probe = True + break + + if not find_probe: + self.reads_without_probe += 1 + + self.barcode_qual_Counter.update(C_U_quals_ascii[:C_len]) + self.umi_qual_Counter.update(C_U_quals_ascii[C_len:]) + + outfq1.write(f'@{header1}\n{cb}{umi}\n+\n{C_U_quals_ascii}\n') + + outfq2.write(f'@{header2}\n{seq2}\n+\n{qual2}\n') + + Convert.run.logger.info(self.fq1_list[i] + ' finished.') + outfq1.close() + outfq2.close() + + # logging + Convert.run.logger.info( + f'processed reads: {utils.format_number(self.total_num)}. ' + f'valid reads: {utils.format_number(self.clean_num)}. ' + ) + + Convert.run.logger.info(f'no polyT reads number : {self.no_polyT_num}') + Convert.run.logger.info(f'low qual reads number: {self.lowQual_num}') + Convert.run.logger.info(f'no_linker: {self.no_linker_num}') + Convert.run.logger.info(f'no_barcode: {self.no_barcode_num}') + Convert.run.logger.info(f'corrected linker: {self.linker_corrected_num}') + Convert.run.logger.info(f'corrected barcode: {self.barcode_corrected_num}') + + if self.clean_num == 0: + raise Exception( + 'no valid reads found! 
please check the --chemistry parameter.') + + if self.bool_probe: + # total probe summary + total_umi = 0 + total_valid_read = 0 + for cb in self.valid_count_dic: + total_umi += len(self.valid_count_dic[cb]) + total_valid_read += sum(self.valid_count_dic[cb].values()) + + # probe summary + count_list = [] + for probe_name in self.probe_dic: + UMI_count = 0 + read_count = 0 + if probe_name in self.probe_count_dic: + for cb in self.probe_count_dic[probe_name]: + UMI_count += len(self.probe_count_dic[probe_name][cb]) + read_count += sum(self.probe_count_dic[probe_name][cb].values()) + count_list.append( + {"probe_name": probe_name, "UMI_count": UMI_count, "read_count": read_count}) + + df_count = pd.DataFrame(count_list, columns=[ + "probe_name", "read_count", "UMI_count"]) + + def format_percent(x): + x = str(round(x*100, 2))+"%" + return x + df_count["read_fraction"] = ( + df_count["read_count"]/total_valid_read).apply(format_percent) + df_count["UMI_fraction"] = ( + df_count["UMI_count"]/total_umi).apply(format_percent) + df_count.sort_values(by="UMI_count", inplace=True, ascending=False) + df_count_file = self.outdir + '/' + self.sample + '_probe_count.tsv' + df_count.to_csv(df_count_file, sep="\t", index=False) + + # stat + BarcodesQ30 = sum([self.barcode_qual_Counter[k] for k in self.barcode_qual_Counter if k >= ord2chr( + 30)]) / float(sum(self.barcode_qual_Counter.values())) * 100 + UMIsQ30 = sum([self.umi_qual_Counter[k] for k in self.umi_qual_Counter if k >= ord2chr( + 30)]) / float(sum(self.umi_qual_Counter.values())) * 100 + + def cal_percent(x): return "{:.2%}".format((x + 0.0) / self.total_num) + stat_info = ''' + Raw Reads: %s + Valid Reads: %s(%s) + Q30 of Barcodes: %.2f%% + Q30 of UMIs: %.2f%% + ''' + with open(self.outdir + '/stat.txt', 'w') as fh: + stat_info = stat_info % (utils.format_number(self.total_num), utils.format_number(self.clean_num), + cal_percent(self.clean_num), BarcodesQ30, + UMIsQ30) + stat_info = re.sub(r'^\s+', r'', stat_info, flags=re.M) + fh.write(stat_info) + + # self.fastqc() + self.clean_up() + + +@utils.add_log +def convert(args): + step_name = "convert" + convert_obj = Convert(args, step_name) + convert_obj.run() + + +def get_opts_convert(parser, sub_program=True): + parser.add_argument('--pattern', help='') + parser.add_argument('--whitelist', help='') + parser.add_argument('--linker', help='') + parser.add_argument('--lowQual', type=int, + help='max phred of base as lowQual, default=0', default=0) + parser.add_argument( + '--lowNum', type=int, help='max number with lowQual allowed, default=2', default=2) + parser.add_argument('--nopolyT', action='store_true', + help='output nopolyT fq') + parser.add_argument('--noLinker', action='store_true', + help='output noLinker fq') + parser.add_argument('--probe_file', help="probe fasta file") + parser.add_argument('--allowNoPolyT', help="allow reads without polyT", action='store_true') + parser.add_argument('--allowNoLinker', help="allow reads without correct linker", action='store_true') + parser.add_argument('--gzip', help="output gzipped fastq", action='store_true') + parser.add_argument( + '--chemistry', choices=__PATTERN_DICT__.keys(), help='chemistry version', default='auto') + if sub_program: + parser.add_argument('--fq1', help='read1 fq file', required=True) + parser.add_argument('--fq2', help='read2 fq file', required=True) + parser = s_common(parser) + + return parser + diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py index 93ecdedb..8c89ad45 100644 --- 
a/celescope/trust_vdj/multi_trust_vdj.py +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -4,11 +4,22 @@ from celescope.tools.Multi import Multi class Multi_trust_vdj(Multi): + def convert(self, sample): + step = 'convert' + arr = self.fq_dict[sample] + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line} ' + f'--fq1 {arr[0]} --fq2 {arr[1]} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + def trust_assemble(self, sample): step = 'trust_assemble' cmd_line = self.get_cmd_line(step, sample) - fq1 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R1.fq{self.fq_suffix}' - fq2 = f'{self.outdir_dic[sample]["barcode"]}/{sample}_new_R2.fq{self.fq_suffix}' + fq1 = f'{self.outdir_dic[sample]["convert"]}/{sample}_1.fq{self.fq_suffix}' + fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' cmd = ( f'{cmd_line} ' f'--fq1 {fq1} ' -- Gitee From 5bc30611654a630c56c183f59006fe5880af037e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 16 Jun 2021 14:44:45 +0800 Subject: [PATCH 44/96] refactor variant calling --- celescope/__init__.py | 6 + celescope/snp/__init__.py | 2 +- celescope/snp/multi_snp.py | 10 +- celescope/snp/snpCalling.py | 43 -- celescope/snp/variant_calling.py | 430 ++++++++++++++++++ .../html/snp/snpCalling_summary.html | 37 -- .../html/snp/variant_calling_summary.html | 36 ++ .../tests/{func_tests.py => test_function.py} | 3 + celescope/tools/step.py | 9 +- celescope/tools/target_metrics.py | 5 +- celescope/tools/utils.py | 41 +- 11 files changed, 509 insertions(+), 113 deletions(-) delete mode 100755 celescope/snp/snpCalling.py create mode 100755 celescope/snp/variant_calling.py delete mode 100755 celescope/templates/html/snp/snpCalling_summary.html create mode 100644 celescope/templates/html/snp/variant_calling_summary.html rename celescope/tests/{func_tests.py => test_function.py} (90%) diff --git a/celescope/__init__.py b/celescope/__init__.py index 28a2454c..bea81b71 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -21,3 +21,9 @@ ASSAY_DICT = { ROOT_PATH = os.path.dirname(__file__) RELEASED_ASSAYS = ['rna', 'vdj', 'tag', ] + +HELP_DICT = { + 'match_dir': 'Match celescope scRNA-Seq directory.', + 'gene_list': 'Gene list file, one gene symbol per line. 
Only results of these genes are reported.', + +} diff --git a/celescope/snp/__init__.py b/celescope/snp/__init__.py index ef961a09..c9f17e51 100755 --- a/celescope/snp/__init__.py +++ b/celescope/snp/__init__.py @@ -1,7 +1,7 @@ __STEPS__ = [ 'mkref', 'sample', 'barcode', 'cutadapt', 'consensus', 'star', 'featureCounts', - 'target_metrics', 'snpCalling', 'analysis_snp' + 'target_metrics', 'variant_calling', 'analysis_snp' ] __ASSAY__ = 'snp' IMPORT_DICT = { diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index 8c692773..89781522 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -31,8 +31,8 @@ class Multi_snp(Multi): self.process_cmd(cmd, step, sample, m=2, x=1) - def snpCalling(self, sample): - step = 'snpCalling' + def variant_calling(self, sample): + step = 'variant_calling' cmd_line = self.get_cmd_line(step, sample) bam = f'{self.outdir_dic[sample]["target_metrics"]}/{sample}_filtered.bam' cmd = ( @@ -44,9 +44,9 @@ class Multi_snp(Multi): def analysis_snp(self, sample): step = 'analysis_snp' - vcf = f'{self.outdir_dic[sample]["snpCalling"]}/{sample}_merged.vcf' - CID_file = f'{self.outdir_dic[sample]["snpCalling"]}/{sample}_CID.tsv' - variant_count_file = f'{self.outdir_dic[sample]["snpCalling"]}/{sample}_variant_count.tsv' + vcf = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_merged.vcf' + CID_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_CID.tsv' + variant_count_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_variant_count.tsv' cmd_line = self.get_cmd_line(step, sample) cmd = ( f'{cmd_line} ' diff --git a/celescope/snp/snpCalling.py b/celescope/snp/snpCalling.py deleted file mode 100755 index 35802ecc..00000000 --- a/celescope/snp/snpCalling.py +++ /dev/null @@ -1,43 +0,0 @@ -import os - -from mutract.utils import Mutract - -import celescope.tools.utils as utils -from celescope.tools.step import s_common - - -@utils.add_log -def snpCalling(args): - - sample = args.sample - outdir = args.outdir - thread = int(args.thread) - match_dir = args.match_dir - bam = args.bam - genomeDir = args.genomeDir - gene_list_file = args.gene_list - - # check dir - if not os.path.exists(outdir): - os.system('mkdir -p %s' % (outdir)) - - # get args - _refFlat, _gtf, fasta = utils.glob_genomeDir(genomeDir, fa=True) - _match_barcode, (_cell_total, match_barcode_file) = utils.read_barcode_file(match_dir, return_file=True) - - # mutract - obj = Mutract( - outdir, sample, bam, fasta, - match_barcode_file, thread=thread, gene_file=gene_list_file - ) - obj.run() - - -def get_opts_snpCalling(parser, sub_program): - if sub_program: - s_common(parser) - parser.add_argument("--bam", help='featureCounts bam', required=True) - parser.add_argument( - "--match_dir", help="match scRNA-Seq dir", required=True) - parser.add_argument("--genomeDir", help='genomeDir', required=True) - parser.add_argument("--gene_list", help='gene_list', required=True) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py new file mode 100755 index 00000000..ec5e7c02 --- /dev/null +++ b/celescope/snp/variant_calling.py @@ -0,0 +1,430 @@ +import logging +import os +import subprocess +from collections import defaultdict +from concurrent.futures import ProcessPoolExecutor + +import pandas as pd +import pysam +from scipy.io import mmwrite +from scipy.sparse import coo_matrix + +import celescope.tools.utils as utils +from celescope.__init__ import HELP_DICT +from celescope.tools.step import Step, s_common +from celescope.rna.mkref 
import parse_genomeDir_rna + + +class Variant_calling(Step): + """ + Features + - Perform variant calling + + Output + + `{sample}_VID.tsv` A unique numeric ID is assigned for each variant. + + `{sample}_CID.tsv` A unique numeric ID is assigned for each cell. + + `{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count. + + `{sample}_support.mtx` Support matrix, only high quality bases are considered. + 0 : no reads/UMIs cover the position. + 1 : all reads/UMIs at the position support the ref allele. + 2 : all reads/UMIs at the position support the alt allele. + 3 : one or more reads/UMIs support both the alt and the ref allele. + """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + # set + self.barcodes, _num = utils.read_barcode_file(args.match_dir) + self.fasta = parse_genomeDir_rna(args.genomeDir)['fasta'] + if args.vcf: + self.vcf_bool = True + self.vcf = args.vcf + else: + self.vcf_bool = False + self.vcf = None + self.df_vcf = None + + # out + self.splitN_bam = f'{self.out_prefix}_splitN.bam' + self.CID_file = f'{self.out_prefix}_CID.tsv' + self.VID_file = f'{self.out_prefix}_VID.tsv' + self.VID_vcf_file = f'{self.out_prefix}_VID.vcf' + self.variant_count_file = f'{self.out_prefix}_variant_count.tsv' + self.ref_mtx_file = f'{self.out_prefix}_ref.mtx' + self.alt_mtx_file = f'{self.out_prefix}_alt.mtx' + self.support_matrix_file = f'{self.out_prefix}_support.mtx' + + + @utils.add_log + def SplitNCigarReads(self): + cmd = ( + f'gatk ' + f'SplitNCigarReads ' + f'-R {self.fasta} ' + f'-I {self.args.bam} ' + f'-O {self.splitN_bam} ' + ) + Variant_calling.SplitNCigarReads.logger.info(cmd) + subprocess.check_call(cmd, shell=True) + + + @utils.add_log + def split_bam(self): + ''' + input: + bam: bam from splitN + barcodes: cell barcodes, list + ouput: + bam_dict: assign reads to cell barcodes and UMI + count_dict: UMI counts per cell + CID: assign ID(1-based) to cells + ''' + + # init + bam_dict = defaultdict(list) + CID_dict = defaultdict(dict) + cells_dir = f'{self.outdir}/cells/' + + # read bam and split + samfile = pysam.AlignmentFile(self.args.bam, "rb") + header = samfile.header + for read in samfile: + attr = read.query_name.split('_') + barcode = attr[0] + if barcode in self.barcodes: + CID = self.barcodes.index(barcode) + 1 + read.set_tag(tag='CL', value=f'CELL{CID}', value_type='Z') + + # assign read to barcode + bam_dict[barcode].append(read) + + + self.split_bam.logger.info('writing cell bam...') + # write new bam + CID = 0 + for barcode in self.barcodes: + # init + CID += 1 + CID_dict[CID]['barcode'] = barcode + CID_dict[CID]['valid'] = False + + # out bam + if barcode in bam_dict: + cell_dir = f'{cells_dir}/cell{CID}' + cell_bam_file = f'{cell_dir}/cell{CID}.bam' + if not os.path.exists(cell_dir): + os.makedirs(cell_dir) + CID_dict[CID]['valid'] = True + cell_bam = pysam.AlignmentFile( + f'{cell_bam_file}', "wb", header=header) + for read in bam_dict[barcode]: + cell_bam.write(read) + cell_bam.close() + + # out CID + df_CID = pd.DataFrame(CID_dict).T + df_CID.index.name = 'CID' + df_CID.to_csv(self.CID_file, sep='\t') + + @utils.add_log + def call_snp(self, CID): + + self.call_snp.logger.info('Processing Cell {}' % CID) + bam = f'{self.outdir}/cells/cell{CID}/cell{CID}.bam' + # sort + sorted_bam = f'{self.outdir}/cells/cell{CID}/cell{CID}_sorted.bam' + cmd_sort = ( + f'samtools sort {bam} -o {sorted_bam}' + ) + subprocess.check_call(cmd_sort, shell=True) + + # mpileup + bcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.bcf' 
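+        # Per-cell pipeline sketch: mpileup writes genotype likelihoods to
+        # an uncompressed BCF; `call -mv` then keeps variant sites only, and
+        # `norm -d none -f <fasta>` left-aligns indels and drops duplicate
+        # records. A second `call -m` pass (without -v) below emits every
+        # covered position, so reference-supporting UMIs can later be
+        # counted from the DP4 field of cell{CID}_all_norm.vcf.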
+ cmd_mpileup = ( + f'bcftools mpileup -Ou ' + f'-f {self.fasta} ' + f'{sorted_bam} -o {bcf} ' + ) + subprocess.check_call(cmd_mpileup, shell=True) + + # call + out_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.vcf' + cmd_call = ( + f'bcftools call -mv -Ov ' + f'-o {out_vcf} ' + f'{bcf}' + f'>/dev/null 2>&1 ' + ) + subprocess.check_call(cmd_call, shell=True) + + # norm + norm_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + cmd_norm = ( + f'bcftools norm -d none ' + f'-f {self.fasta} ' + f'{out_vcf} ' + f'-o {norm_vcf} ' + ) + subprocess.check_call(cmd_norm, shell=True) + + # call all position + out_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all.vcf' + cmd_all_call = ( + f'bcftools call -m -Ov ' + f'-o {out_all_vcf} ' + f'{bcf}' + f'>/dev/null 2>&1 ' + ) + subprocess.check_call(cmd_all_call, shell=True) + + # norm all + norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + cmd_all_norm = ( + f'bcftools norm -d none ' + f'-f {self.fasta} ' + f'{out_all_vcf} ' + f'-o {norm_all_vcf} ' + ) + subprocess.check_call(cmd_all_norm, shell=True) + + def call_all_snp(self): + all_res = [] + _df_index, df_valid = self.read_CID() + CID_arg = df_valid.index + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(self.call_snp, CID_arg): + all_res.append(res) + + def read_CID(self): + df_index = pd.read_csv(self.CID_file, sep='\t', index_col=0, dtype=object) + df_valid = df_index[df_index['valid'] == 'True'] + return df_index, df_valid + + @staticmethod + def _parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): + ''' + parse vcf into df + ''' + vcf = pysam.VariantFile(vcf_file) + df = pd.DataFrame(columns=[col for col in cols] + infos) + rec_dict = {} + for rec in vcf.fetch(): + + for col in cols: + rec_dict[col] = getattr(rec, col) + # if ref == alt: alleles=(ref,) + # else alleles=(ref, alt) + if col == 'alleles': + rec_dict['ref'] = rec_dict['alleles'][0] + rec_dict['alt'] = '.' 
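+                    # '.' mirrors the VCF convention for a missing ALT;
+                    # overwritten below when the record is biallelic
+                    # (alleles == (ref, alt))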
+ if len(rec_dict['alleles']) == 2: + rec_dict['alt'] = rec_dict['alleles'][1] + + for info in infos: + rec_dict[info] = rec.info[info] + + df = df.append(pd.Series(rec_dict),ignore_index=True) + return df + + def parse_vcf(self): + self.df_vcf = self._parse_vcf(self.vcf_file) + + def merge_vcf(self): + ''' + merge cell vcf into one non-duplicated vcf + add VID(variant ID) and CID(cell ID) + ''' + _df_index, df_valid = self.read_CID() + CIDs = df_valid.index + + # variant dict + v_cols = ['chrom', 'pos', 'alleles'] + v_dict = {} + + for CID in CIDs: + CID = str(CID) + vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + vcf = pysam.VariantFile(vcf_file,'r') + for rec in vcf.fetch(): + v = ','.join([str(getattr(rec, col)) for col in v_cols]) + if not v in v_dict: + v_dict[v] = dict() + v_dict[v]['CID'] = [CID] + v_dict[v]['record'] = rec + else: + v_dict[v]['CID'].append(CID) + + # output + def get_vcf_header(CIDs): + CID = CIDs[0] + vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + vcf = pysam.VariantFile(vcf_file,'r') + return vcf.header + vcf_header = get_vcf_header(CIDs) + merged_vcf_file = f'{self.outdir}/{self.sample}_merged.vcf' + vcf_header.info.add('VID', number=1, type='String', description='Variant ID') + vcf_header.info.add('CID', number=1, type='String', description='Cell ID') + merged_vcf = pysam.VariantFile(merged_vcf_file,'w', header=vcf_header) + + VID = 0 + for v in sorted(v_dict.keys()): + VID += 1 + rec = v_dict[v]['record'] + CID = ','.join(v_dict[v]['CID']) + record = merged_vcf.new_record() + cols = ['chrom', 'pos', 'alleles'] + for col in cols: + setattr(record,col, getattr(rec,col)) + record.info['VID'] = str(VID) + record.info['CID'] = CID + merged_vcf.write(record) + + merged_vcf.close() + self.vcf = merged_vcf_file + + + def write_VID_file(self): + df_VID = self.df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] + df_VID.to_csv(self.VID_file, sep='\t', index=False) + + + def add_VID(self): + vcf = pysam.VariantFile(self.vcf,'r') + vcf_header = vcf.header + if 'VID' in vcf_header.info: + logging.info('VID is already in vcf file!') + return + vcf_header.info.add('VID', number=1, type='String', description='Variant ID') + VID_vcf = pysam.VariantFile(self.VID_vcf_file, 'w', header=vcf_header) + VID = 0 + for rec in vcf.fetch(): + VID += 1 + rec.info['VID'] = str(VID) + VID_vcf.write(rec) + VID_vcf.close() + self.vcf = self.VID_vcf_file + + @utils.add_log + def cell_UMI(self, CID): + df_UMI = pd.DataFrame(columns=['VID', 'CID', 'ref_count', 'alt_count']) + norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + df_cell_vcf = self._parse_vcf(norm_all_vcf, infos=['DP4']) + + def get_DP4(row, alt): + DP4 = row['DP4'].iloc[0] + if alt == 'ref': + indexs = [0,1] + elif alt == 'alt': + indexs = [2,3] + umi = sum([DP4[index] for index in indexs]) + return umi + + def map_vcf_row(row, df_cell_vcf): + pos = row['pos'] + chrom = row['chrom'] + alt = row['alt'] + df_pos = df_cell_vcf[(df_cell_vcf['pos']==pos) & (df_cell_vcf['chrom']==chrom)] + df_ref = df_pos[df_pos['alt']=='.'] + df_alt = df_pos[df_pos['alt']==alt] + ref_UMI = 0 + alt_UMI = 0 + if df_ref.shape[0] != 0: + ref_UMI = get_DP4(df_ref, 'ref') + if df_alt.shape[0] != 0: + alt_UMI = get_DP4(df_alt, 'alt') + return ref_UMI, alt_UMI, pos, chrom, alt + + for index in self.df_vcf.index: + row = self.df_vcf.loc[index,] + ref_UMI, alt_UMI, _pos, _chrom, _alt = map_vcf_row(row, df_cell_vcf) + if (ref_UMI + alt_UMI) != 0: + VID = row['VID'] + dic = { + 'VID':VID, + 'CID':CID, + 
'ref_count':ref_UMI, + 'alt_count':alt_UMI, + } + df_UMI = df_UMI.append(dic, ignore_index=True) + return df_UMI + + @utils.add_log + def get_UMI(self): + ''' + get variant and ref UMI supporting an allele + ''' + _df_index, df_valid = self.read_CID() + + df_UMI_list = [] + with ProcessPoolExecutor(self.thread) as pool: + for res in pool.map(self.cell_UMI, list(df_valid.index)): + df_UMI_list.append(res) + + df_UMI = pd.concat(df_UMI_list) + df_UMI['VID'] = df_UMI['VID'].astype('int') + df_UMI.sort_values(by=['VID','CID'], inplace=True) + df_UMI.to_csv(self.variant_count_file, sep='\t', index=False) + + def write_support_matrix(self): + def set_support_bit(row): + ref_bit = 1 if row['ref_count'] > 0 else 0 + alt_bit = 2 if row['alt_count'] > 0 else 0 + support_bit = ref_bit + alt_bit + return support_bit + + df_variant_count = pd.read_csv(self.variant_count_file, sep='\t') + df_variant_count['support'] = self.df_variant_count.apply(set_support_bit, axis=1) + support_mtx = coo_matrix( + (df_variant_count.support, (df_variant_count.VID - 1, df_variant_count.CID - 1)) + ) + mmwrite(self.support_mtx_file, support_mtx) + + + def run(self): + self.SplitNCigarReads() + self.split_bam() + self.call_all_snp() + if self.vcf_bool: + self.add_VID() + else: + self.merge_vcf() + self.parse_vcf() + self.write_VID_file() + self.get_UMI() + self.write_support_matrix() + + +@utils.add_log +def variant_calling(args): + + step_name = 'variant_calling' + variant_calling_obj = Variant_calling(args, step_name) + variant_calling_obj.run() + + +def get_opts_variant_calling(parser, sub_program): + + parser.add_argument("--genomeDir", help='Genome directory', required=True) + parser.add_argument( + "--vcf", + help="""VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level +and use these variants as input vcf.""", + required=False + ) + if sub_program: + parser.add_argument( + "--bam", + help='Input BAM file from step `target_metrics`. ', + required=True + ) + parser.add_argument( + "--match_dir", + help=HELP_DICT['match_dir'], + required=True + ) + s_common(parser) diff --git a/celescope/templates/html/snp/snpCalling_summary.html b/celescope/templates/html/snp/snpCalling_summary.html deleted file mode 100755 index 9c63da39..00000000 --- a/celescope/templates/html/snp/snpCalling_summary.html +++ /dev/null @@ -1,37 +0,0 @@ -
-<div>
-    <div>
-        Variant Calling
-    </div>
-    <div>
-        <table>
-            {% for item in snpCalling_summary %}
-            {% if loop.index <= (loop.length+1)/2 %}
-            <tr>
-                {% for i in item %}
-                <td>{{ i|e }}</td>
-                {% endfor %}
-            </tr>
-            {% endif %}
-            {% endfor %}
-        </table>
-        <table>
-            {% for item in snpCalling_summary %}
-            {% if loop.index > (loop.length+1)/2 %}
-            <tr>
-                {% for i in item %}
-                <td>{{ i|e }}</td>
-                {% endfor %}
-            </tr>
-            {% endif %}
-            {% endfor %}
-        </table>
-    </div>
-</div>
-
diff --git a/celescope/templates/html/snp/variant_calling_summary.html b/celescope/templates/html/snp/variant_calling_summary.html
new file mode 100644
index 00000000..0cb55764
--- /dev/null
+++ b/celescope/templates/html/snp/variant_calling_summary.html
@@ -0,0 +1,36 @@
+<div>
+    <div>
+        Variant Calling
+    </div>
+    <div>
+        <table>
+            {% for item in variant_calling_summary %}
+            {% if loop.index <= (loop.length+1)/2 %}
+            <tr>
+                {% for i in item %}
+                <td>{{ i|e }}</td>
+                {% endfor %}
+            </tr>
+            {% endif %}
+            {% endfor %}
+        </table>
+        <table>
+            {% for item in variant_calling_summary %}
+            {% if loop.index > (loop.length+1)/2 %}
+            <tr>
+                {% for i in item %}
+                <td>{{ i|e }}</td>
+                {% endfor %}
+            </tr>
+            {% endif %}
+            {% endfor %}
+        </table>
+    </div>
\ No newline at end of file diff --git a/celescope/tests/func_tests.py b/celescope/tests/test_function.py similarity index 90% rename from celescope/tests/func_tests.py rename to celescope/tests/test_function.py index ecc0601e..8936c9aa 100755 --- a/celescope/tests/func_tests.py +++ b/celescope/tests/test_function.py @@ -10,6 +10,7 @@ class Tests(unittest.TestCase): def setUp(self): pass + @unittest.skip("tested") def test_stat_to_metric(self): os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/multi_tests/rna') args_dict = { @@ -26,3 +27,5 @@ class Tests(unittest.TestCase): obj.stat_to_metric() print(obj.content_dict['metric']) + def test_test(self): + assert 0 == 0 \ No newline at end of file diff --git a/celescope/tools/step.py b/celescope/tools/step.py index 43f981f7..0c759de3 100755 --- a/celescope/tools/step.py +++ b/celescope/tools/step.py @@ -12,6 +12,8 @@ from jinja2 import Environment, FileSystemLoader, select_autoescape from celescope.tools.utils import add_log +Metric = namedtuple("Metric", "name value total fraction") + def s_common(parser): """subparser common arguments """ @@ -33,7 +35,7 @@ class Step: self.outdir = args.outdir self.sample = args.sample self.assay = args.assay - self.thread = args.thread + self.thread = int(args.thread) self.debug = args.debug # set self.out_prefix = f'{self.outdir}/{self.sample}' @@ -43,7 +45,6 @@ class Step: os.system('mkdir -p %s' % self.outdir) self.metric_list = [] - self.Metric = namedtuple("Metric", "name value total fraction") self.path_dict = { "metric": f'{self.outdir}/../.metrics.json', "data": f'{self.outdir}/../.data.json' @@ -68,7 +69,7 @@ class Step: def add_metric(self, name, value=None, total=None, fraction=None): '''add metric to metric_list ''' - self.metric_list.append(self.Metric( + self.metric_list.append(Metric( name=name, value=value, total=total, fraction=fraction )) @@ -83,7 +84,7 @@ class Step: fraction = metric.value / metric.total if fraction: fraction = round(fraction, 4) - metric_list.append(self.Metric( + metric_list.append(Metric( name=metric.name, value=metric.value, total=metric.total, diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index ed824222..1ac9dfed 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -4,6 +4,7 @@ import pysam import celescope.tools.utils as utils from celescope.tools.step import Step, s_common +from celescope.__init__ import HELP_DICT class Target_metrics(Step): @@ -86,9 +87,9 @@ def target_metrics(args): def get_opts_target_metrics(parser, sub_program): + parser.add_argument("--gene_list", help=HELP_DICT['gene_list'], required=True) if sub_program: - parser = s_common(parser) parser.add_argument("--bam", help='featureCounts bam', required=True) parser.add_argument('--match_dir', help='match_dir', required=True) - parser.add_argument("--gene_list", help='gene_list', required=True) + parser = s_common(parser) diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 6baa77b6..6741903a 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -97,27 +97,6 @@ def arg_str(arg, arg_name): return '' -def read_barcode_file(match_dir, return_file=False): - ''' - multi version compatible - ''' - match_barcode_file1 = glob.glob( - f"{match_dir}/*count*/*_cellbarcode.tsv") - match_barcode_file2 = glob.glob( - f"{match_dir}/*count*/*matrix_10X/*_cellbarcode.tsv") - match_barcode_file3 = glob.glob( - f"{match_dir}/*count*/*matrix_10X/*barcodes.tsv") - match_barcode_file = ( - match_barcode_file1 + - 
match_barcode_file2 + - match_barcode_file3)[0] - match_barcode, cell_total = read_one_col(match_barcode_file) - match_barcode = set(match_barcode) - if return_file: - return match_barcode, (cell_total, match_barcode_file) - return match_barcode, cell_total - - def format_stat(count, total_count): percent = round(count / total_count * 100, 2) string = f'{format_number(count)}({percent}%)' @@ -696,6 +675,26 @@ def parse_annovar(annovar_file): return df +def read_barcode_file(match_dir, return_file=False): + ''' + multi version compatible + ''' + match_barcode_file1 = glob.glob( + f"{match_dir}/*count*/*_cellbarcode.tsv") + match_barcode_file2 = glob.glob( + f"{match_dir}/*count*/*matrix_10X/*_cellbarcode.tsv") + match_barcode_file3 = glob.glob( + f"{match_dir}/*count*/*matrix_10X/*barcodes.tsv") + match_barcode_file = ( + match_barcode_file1 + + match_barcode_file2 + + match_barcode_file3)[0] + match_barcode, cell_total = read_one_col(match_barcode_file) + if return_file: + return match_barcode, (cell_total, match_barcode_file) + return match_barcode, cell_total + + def parse_match_dir(match_dir): match_dict = {} match_barcode, cell_total = read_barcode_file(match_dir) -- Gitee From 3b14e7bc6e1a3b9bb01d7d33b1664b293cb268a1 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 16 Jun 2021 18:08:35 +0800 Subject: [PATCH 45/96] tested --- .gitignore | 3 + celescope/snp/multi_snp.py | 2 +- celescope/snp/tests/test_variant_calling.py | 38 ++++++ celescope/snp/variant_calling.py | 138 ++++++++++---------- celescope/tools/multi.py | 2 +- release_local.py | 4 +- 6 files changed, 116 insertions(+), 71 deletions(-) create mode 100644 celescope/snp/tests/test_variant_calling.py diff --git a/.gitignore b/.gitignore index be308f44..30d2a341 100755 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# test output +test_output/ + # vscode .vscode/ diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index 89781522..ea96c69c 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -44,7 +44,7 @@ class Multi_snp(Multi): def analysis_snp(self, sample): step = 'analysis_snp' - vcf = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_merged.vcf' + vcf = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}.vcf' CID_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_CID.tsv' variant_count_file = f'{self.outdir_dic[sample]["variant_calling"]}/{sample}_variant_count.tsv' cmd_line = self.get_cmd_line(step, sample) diff --git a/celescope/snp/tests/test_variant_calling.py b/celescope/snp/tests/test_variant_calling.py new file mode 100644 index 00000000..5d612b00 --- /dev/null +++ b/celescope/snp/tests/test_variant_calling.py @@ -0,0 +1,38 @@ +import unittest +import os +from collections import namedtuple +from celescope.snp.variant_calling import Variant_calling + +ROOT_DIR = os.path.dirname(__file__) + +class Test_variant_calling(unittest.TestCase): + def setUp(self): + os.chdir(ROOT_DIR) + Args = namedtuple("Args", "thread outdir sample assay debug " + "genomeDir vcf bam match_dir") + self.args = Args( + thread=10, + outdir="./test_output/07.variant_calling", + sample="test1", + assay="snp", + debug=False, + genomeDir="/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92", + vcf=None, + bam="./test_data/06.target_metrics/subset_filter.bam", + match_dir="./test_data/match_dir", + ) + + def test_run(self): + obj = Variant_calling(self.args, "variant_calling") + ''' + obj.SplitNCigarReads() + obj.split_bam() + obj.call_all_snp() + if obj.vcf_bool: + obj.add_VID() + 
else: + obj.merge_vcf() + ''' + obj.write_VID_file() + obj.get_UMI() + obj.write_support_matrix() + obj.clean_up() \ No newline at end of file diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index ec5e7c02..2edc3e24 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -15,6 +15,33 @@ from celescope.tools.step import Step, s_common from celescope.rna.mkref import parse_genomeDir_rna + +def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): + ''' + parse vcf into df + ''' + vcf = pysam.VariantFile(vcf_file) + df = pd.DataFrame(columns=list(cols) + list(infos)) + rec_dict = {} + for rec in vcf.fetch(): + + for col in cols: + rec_dict[col] = getattr(rec, col) + # if ref == alt: alleles=(ref,) + # else alleles=(ref, alt) + if col == 'alleles': + rec_dict['ref'] = rec_dict['alleles'][0] + rec_dict['alt'] = '.' + if len(rec_dict['alleles']) == 2: + rec_dict['alt'] = rec_dict['alleles'][1] + + for info in infos: + rec_dict[info] = rec.info[info] + + df = df.append(pd.Series(rec_dict),ignore_index=True) + return df + + class Variant_calling(Step): """ Features @@ -43,20 +70,16 @@ class Variant_calling(Step): self.fasta = parse_genomeDir_rna(args.genomeDir)['fasta'] if args.vcf: self.vcf_bool = True - self.vcf = args.vcf else: self.vcf_bool = False - self.vcf = None self.df_vcf = None # out self.splitN_bam = f'{self.out_prefix}_splitN.bam' self.CID_file = f'{self.out_prefix}_CID.tsv' self.VID_file = f'{self.out_prefix}_VID.tsv' - self.VID_vcf_file = f'{self.out_prefix}_VID.vcf' + self.final_vcf_file = f'{self.out_prefix}.vcf' self.variant_count_file = f'{self.out_prefix}_variant_count.tsv' - self.ref_mtx_file = f'{self.out_prefix}_ref.mtx' - self.alt_mtx_file = f'{self.out_prefix}_alt.mtx' self.support_matrix_file = f'{self.out_prefix}_support.mtx' @@ -91,7 +114,7 @@ class Variant_calling(Step): cells_dir = f'{self.outdir}/cells/' # read bam and split - samfile = pysam.AlignmentFile(self.args.bam, "rb") + samfile = pysam.AlignmentFile(self.splitN_bam, "rb") header = samfile.header for read in samfile: attr = read.query_name.split('_') @@ -131,29 +154,30 @@ class Variant_calling(Step): df_CID.index.name = 'CID' df_CID.to_csv(self.CID_file, sep='\t') + @staticmethod @utils.add_log - def call_snp(self, CID): + def call_snp(CID, outdir, fasta): - self.call_snp.logger.info('Processing Cell {}' % CID) - bam = f'{self.outdir}/cells/cell{CID}/cell{CID}.bam' + Variant_calling.call_snp.logger.info('Processing Cell %s' % CID) + bam = f'{outdir}/cells/cell{CID}/cell{CID}.bam' # sort - sorted_bam = f'{self.outdir}/cells/cell{CID}/cell{CID}_sorted.bam' + sorted_bam = f'{outdir}/cells/cell{CID}/cell{CID}_sorted.bam' cmd_sort = ( f'samtools sort {bam} -o {sorted_bam}' ) subprocess.check_call(cmd_sort, shell=True) # mpileup - bcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.bcf' + bcf = f'{outdir}/cells/cell{CID}/cell{CID}.bcf' cmd_mpileup = ( f'bcftools mpileup -Ou ' - f'-f {self.fasta} ' + f'-f {fasta} ' f'{sorted_bam} -o {bcf} ' ) subprocess.check_call(cmd_mpileup, shell=True) # call - out_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}.vcf' + out_vcf = f'{outdir}/cells/cell{CID}/cell{CID}.vcf' cmd_call = ( f'bcftools call -mv -Ov ' f'-o {out_vcf} ' @@ -163,17 +187,17 @@ class Variant_calling(Step): subprocess.check_call(cmd_call, shell=True) # norm - norm_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' + norm_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_norm.vcf' cmd_norm = ( f'bcftools norm -d none ' - f'-f 
{self.fasta} ' + f'-f {fasta} ' f'{out_vcf} ' f'-o {norm_vcf} ' ) subprocess.check_call(cmd_norm, shell=True) # call all position - out_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all.vcf' + out_all_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_all.vcf' cmd_all_call = ( f'bcftools call -m -Ov ' f'-o {out_all_vcf} ' @@ -183,21 +207,24 @@ class Variant_calling(Step): subprocess.check_call(cmd_all_call, shell=True) # norm all - norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + norm_all_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' cmd_all_norm = ( f'bcftools norm -d none ' - f'-f {self.fasta} ' + f'-f {fasta} ' f'{out_all_vcf} ' f'-o {norm_all_vcf} ' ) subprocess.check_call(cmd_all_norm, shell=True) + @utils.add_log def call_all_snp(self): all_res = [] _df_index, df_valid = self.read_CID() CID_arg = df_valid.index + outdir_arg = [self.outdir] * len(CID_arg) + fasta_arg = [self.fasta] * len(CID_arg) with ProcessPoolExecutor(self.thread) as pool: - for res in pool.map(self.call_snp, CID_arg): + for res in pool.map(self.call_snp, CID_arg, outdir_arg, fasta_arg): all_res.append(res) def read_CID(self): @@ -205,37 +232,11 @@ class Variant_calling(Step): df_valid = df_index[df_index['valid'] == 'True'] return df_index, df_valid - @staticmethod - def _parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): - ''' - parse vcf into df - ''' - vcf = pysam.VariantFile(vcf_file) - df = pd.DataFrame(columns=[col for col in cols] + infos) - rec_dict = {} - for rec in vcf.fetch(): - - for col in cols: - rec_dict[col] = getattr(rec, col) - # if ref == alt: alleles=(ref,) - # else alleles=(ref, alt) - if col == 'alleles': - rec_dict['ref'] = rec_dict['alleles'][0] - rec_dict['alt'] = '.' - if len(rec_dict['alleles']) == 2: - rec_dict['alt'] = rec_dict['alleles'][1] - - for info in infos: - rec_dict[info] = rec.info[info] - - df = df.append(pd.Series(rec_dict),ignore_index=True) - return df - - def parse_vcf(self): - self.df_vcf = self._parse_vcf(self.vcf_file) + @utils.add_log def merge_vcf(self): ''' + if vcf not provided, merge cell vcf into one non-duplicated vcf add VID(variant ID) and CID(cell ID) ''' @@ -266,10 +267,9 @@ class Variant_calling(Step): vcf = pysam.VariantFile(vcf_file,'r') return vcf.header vcf_header = get_vcf_header(CIDs) - merged_vcf_file = f'{self.outdir}/{self.sample}_merged.vcf' vcf_header.info.add('VID', number=1, type='String', description='Variant ID') vcf_header.info.add('CID', number=1, type='String', description='Cell ID') - merged_vcf = pysam.VariantFile(merged_vcf_file,'w', header=vcf_header) + merged_vcf = pysam.VariantFile(self.final_vcf_file,'w', header=vcf_header) VID = 0 for v in sorted(v_dict.keys()): @@ -283,37 +283,37 @@ class Variant_calling(Step): record.info['VID'] = str(VID) record.info['CID'] = CID merged_vcf.write(record) - merged_vcf.close() - self.vcf = merged_vcf_file - + @utils.add_log def write_VID_file(self): - df_VID = self.df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] + df_vcf = parse_vcf(self.final_vcf_file) + df_VID = df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] df_VID.to_csv(self.VID_file, sep='\t', index=False) - + @utils.add_log def add_VID(self): - vcf = pysam.VariantFile(self.vcf,'r') + vcf = pysam.VariantFile(self.args.vcf,'r') vcf_header = vcf.header if 'VID' in vcf_header.info: logging.info('VID is already in vcf file!') return vcf_header.info.add('VID', number=1, type='String', description='Variant ID') - VID_vcf = pysam.VariantFile(self.VID_vcf_file, 'w', header=vcf_header) + 
VID_vcf = pysam.VariantFile(self.final_vcf_file, 'w', header=vcf_header) VID = 0 for rec in vcf.fetch(): VID += 1 rec.info['VID'] = str(VID) VID_vcf.write(rec) VID_vcf.close() - self.vcf = self.VID_vcf_file + @staticmethod @utils.add_log - def cell_UMI(self, CID): + def cell_UMI(CID, outdir, final_vcf_file): + df_vcf = parse_vcf(final_vcf_file) df_UMI = pd.DataFrame(columns=['VID', 'CID', 'ref_count', 'alt_count']) - norm_all_vcf = f'{self.outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' - df_cell_vcf = self._parse_vcf(norm_all_vcf, infos=['DP4']) + norm_all_vcf = f'{outdir}/cells/cell{CID}/cell{CID}_all_norm.vcf' + df_cell_vcf = parse_vcf(norm_all_vcf, infos=['DP4']) def get_DP4(row, alt): DP4 = row['DP4'].iloc[0] @@ -339,8 +339,8 @@ class Variant_calling(Step): alt_UMI = get_DP4(df_alt, 'alt') return ref_UMI, alt_UMI, pos, chrom, alt - for index in self.df_vcf.index: - row = self.df_vcf.loc[index,] + for index in df_vcf.index: + row = df_vcf.loc[index,] ref_UMI, alt_UMI, _pos, _chrom, _alt = map_vcf_row(row, df_cell_vcf) if (ref_UMI + alt_UMI) != 0: VID = row['VID'] @@ -361,15 +361,19 @@ class Variant_calling(Step): _df_index, df_valid = self.read_CID() df_UMI_list = [] + CID_arg = list(df_valid.index) + outdir_arg = [self.outdir] * len(CID_arg) + final_vcf_file_arg = [self.final_vcf_file] * len(CID_arg) with ProcessPoolExecutor(self.thread) as pool: - for res in pool.map(self.cell_UMI, list(df_valid.index)): + for res in pool.map(Variant_calling.cell_UMI, CID_arg, outdir_arg, final_vcf_file_arg): df_UMI_list.append(res) df_UMI = pd.concat(df_UMI_list) df_UMI['VID'] = df_UMI['VID'].astype('int') df_UMI.sort_values(by=['VID','CID'], inplace=True) df_UMI.to_csv(self.variant_count_file, sep='\t', index=False) - + + @utils.add_log def write_support_matrix(self): def set_support_bit(row): ref_bit = 1 if row['ref_count'] > 0 else 0 @@ -378,11 +382,11 @@ class Variant_calling(Step): return support_bit df_variant_count = pd.read_csv(self.variant_count_file, sep='\t') - df_variant_count['support'] = self.df_variant_count.apply(set_support_bit, axis=1) + df_variant_count['support'] = df_variant_count.apply(set_support_bit, axis=1) support_mtx = coo_matrix( (df_variant_count.support, (df_variant_count.VID - 1, df_variant_count.CID - 1)) ) - mmwrite(self.support_mtx_file, support_mtx) + mmwrite(self.support_matrix_file, support_mtx) def run(self): @@ -393,10 +397,10 @@ class Variant_calling(Step): self.add_VID() else: self.merge_vcf() - self.parse_vcf() self.write_VID_file() self.get_UMI() self.write_support_matrix() + self.clean_up() @utils.add_log diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index ae938ad0..89e608cc 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -327,7 +327,7 @@ job_end os.system('mkdir -p ./shell/') for sample in self.shell_dict: with open(f'./shell/{sample}.sh', 'w') as f: - f.write("set -e\n") + f.write("set -eo pipefail\n") f.write(self.shell_dict[sample]) def run(self): diff --git a/release_local.py b/release_local.py index 1cbbd0e3..31998936 100755 --- a/release_local.py +++ b/release_local.py @@ -9,7 +9,7 @@ CONDA_ROOT = '/SGRNJ/Public/Software/conda_env/' @add_log def create_conda(): cmd = f""" - set -e + set -eo pipefail conda create -n {ENV_NAME} source activate {ENV_NAME} conda install --file conda_pkgs.txt --channel conda-forge --channel bioconda --channel r --channel imperial-college-research-computing @@ -25,7 +25,7 @@ def create_conda(): @add_log def lint_code(): cmd = """ - set -e + set -eo pipefail celescope -h pip 
install -i https://pypi.mirrors.ustc.edu.cn/simple/ pylint # lint -- Gitee From ea713c9b63b95fbca326ad4124a4683b80355f84 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 17 Jun 2021 10:56:13 +0800 Subject: [PATCH 46/96] refactor target_metrics --- celescope/__init__.py | 2 +- celescope/snp/variant_calling.py | 4 +- celescope/tools/target_metrics.py | 87 +++++++++++++++++++------------ docs/snp/snpCalling.md | 21 -------- docs/snp/variant_calling.md | 38 ++++++++++++++ docs/tools/target_metrics.md | 21 +++++--- 6 files changed, 109 insertions(+), 64 deletions(-) delete mode 100644 docs/snp/snpCalling.md create mode 100644 docs/snp/variant_calling.md diff --git a/celescope/__init__.py b/celescope/__init__.py index bea81b71..9d9c8f0f 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -25,5 +25,5 @@ RELEASED_ASSAYS = ['rna', 'vdj', 'tag', ] HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', 'gene_list': 'Gene list file, one gene symbol per line. Only results of these genes are reported.', - + 'genomeDir': 'Genome directory after running `mkref`.', } diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index 2edc3e24..9b564e38 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -45,7 +45,7 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): class Variant_calling(Step): """ Features - - Perform variant calling + - Perform variant calling. Output @@ -413,7 +413,7 @@ def variant_calling(args): def get_opts_variant_calling(parser, sub_program): - parser.add_argument("--genomeDir", help='Genome directory', required=True) + parser.add_argument("--genomeDir", help=HELP_DICT['genomeDir'], required=True) parser.add_argument( "--vcf", help="""VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index 1ac9dfed..9101d812 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -8,26 +8,37 @@ from celescope.__init__ import HELP_DICT class Target_metrics(Step): + """ + Features + - Filter bam file + - Filter reads that are not cell-associated. + - Filter reads that are not mapped to target genes. + + - Collect enrichment metrics. + + Output + - `filtered.bam` BAM file after filtering. 
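+
+    Metrics
+    - `Number of Target Genes`, `Total UMIs`, `Enriched UMIs`,
+    `Enriched UMIs in Cells` and `Median Enriched UMIs per Cell`,
+    collected in `parse_count_dict_add_metrics` below.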
+ """ + def __init__(self, args, step_name): Step.__init__(self, args, step_name) - self.gene_list = args.gene_list - self.match_dir = args.match_dir - self.bam = args.bam - self.out_bam_file = f'{self.outdir}/{self.sample}_filtered.bam' + # set + self.match_barcode = set(utils.parse_match_dir(args.match_dir)["match_barcode"]) + self.gene_list, self.n_gene = utils.read_one_col(args.gene_list) + self.count_dict = utils.genDict(dim=3, valType=int) - def run(self): - gene_list, n_gene = utils.read_one_col(self.gene_list) self.add_metric( name="Number of Target Genes", - value=n_gene, + value=self.n_gene, ) - match_barcode = set(utils.parse_match_dir(self.match_dir)["match_barcode"]) - count_dict = utils.genDict(dim=3, valType=int) - + # out file + self.out_bam_file = f'{self.out_prefix}_filtered.bam' - with pysam.AlignmentFile(self.bam, "rb") as reader: + @utils.add_log + def read_bam_write_filtered(self): + with pysam.AlignmentFile(self.args.bam, "rb") as reader: with pysam.AlignmentFile(self.out_bam_file, "wb", header=reader.header) as writer: for record in reader: try: @@ -36,49 +47,57 @@ class Target_metrics(Step): continue barcode = record.get_tag('CB') UMI = record.get_tag('UB') - if barcode in match_barcode and gene_name in gene_list: + if barcode in self.match_barcode and gene_name in self.gene_list: writer.write(record) - count_dict[barcode][gene_name][UMI] += 1 + self.count_dict[barcode][gene_name][UMI] += 1 - UMIs = 0 + @utils.add_log + def parse_count_dict_add_metrics(self): + total_UMIs = 0 enriched_UMIs = 0 enriched_UMIs_in_cells = 0 - enriched_UMIs_per_cell = [] - - - for barcode in count_dict: - barcode_enriched_UMI = 0 - for gene_name in count_dict[barcode]: - gene_UMI = len(count_dict[barcode][gene_name]) - UMIs += gene_UMI - if gene_name in gene_list: + enriched_UMIs_per_cell_list = [] + + for barcode in self.count_dict: + cell_enriched_UMI = 0 + for gene_name in self.count_dict[barcode]: + gene_UMI = len(self.count_dict[barcode][gene_name]) + total_UMIs += gene_UMI + if gene_name in self.gene_list: enriched_UMIs += gene_UMI - if barcode in match_barcode: + if barcode in self.match_barcode: enriched_UMIs_in_cells += gene_UMI - barcode_enriched_UMI += gene_UMI - if barcode in match_barcode: - enriched_UMIs_per_cell.append(barcode_enriched_UMI) - target_metrics.logger.debug(enriched_UMIs_per_cell) + cell_enriched_UMI += gene_UMI + + if barcode in self.match_barcode: + enriched_UMIs_per_cell_list.append(cell_enriched_UMI) + + self.add_metric( + name="Total UMIs", + value=total_UMIs, + ) self.add_metric( name="Enriched UMIs", value=enriched_UMIs, - total=UMIs, + total=total_UMIs, ) self.add_metric( name="Enriched UMIs in Cells", value=enriched_UMIs_in_cells, - total=UMIs, + total=total_UMIs, ) self.add_metric( name="Median Enriched UMIs per Cell", - value=np.median(enriched_UMIs_per_cell), + value=np.median(enriched_UMIs_per_cell_list), ) + def run(self): + self.read_bam_write_filtered() + self.parse_count_dict_add_metrics() self.clean_up() - @utils.add_log def target_metrics(args): step_name = "target_metrics" @@ -89,7 +108,7 @@ def target_metrics(args): def get_opts_target_metrics(parser, sub_program): parser.add_argument("--gene_list", help=HELP_DICT['gene_list'], required=True) if sub_program: - parser.add_argument("--bam", help='featureCounts bam', required=True) - parser.add_argument('--match_dir', help='match_dir', required=True) + parser.add_argument("--bam", help='Input bam file', required=True) + parser.add_argument('--match_dir', help=HELP_DICT['match_dir'], 
required=True) parser = s_common(parser) diff --git a/docs/snp/snpCalling.md b/docs/snp/snpCalling.md deleted file mode 100644 index 3814465a..00000000 --- a/docs/snp/snpCalling.md +++ /dev/null @@ -1,21 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` featureCounts bam - -`--match_dir` match scRNA-Seq dir - -`--genomeDir` genomeDir - -`--gene_list` gene_list - diff --git a/docs/snp/variant_calling.md b/docs/snp/variant_calling.md new file mode 100644 index 00000000..aed2d6fa --- /dev/null +++ b/docs/snp/variant_calling.md @@ -0,0 +1,38 @@ +## Features +- Perform variant calling. + +## Output + +`{sample}_VID.tsv` A unique numeric ID is assigned for each variant. + +`{sample}_CID.tsv` A unique numeric ID is assigned for each cell. + +`{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count. + +`{sample}_support.mtx` Support matrix, only high quality bases are considered. +0 : no reads/UMIs cover the position. +1 : all reads/UMIs at the position support the ref allele. +2 : all reads/UMIs at the position support the alt allele. +3 : one or more reads/UMIs support both the alt and the ref allele. + + +## Arguments +`--genomeDir` Genome directory after running `mkref`. + +`--vcf` VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level +and use these variants as input vcf. + +`--bam` Input BAM file from step `target_metrics`. + +`--match_dir` Match celescope scRNA-Seq directory. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md index 751de192..d2fbaa04 100644 --- a/docs/tools/target_metrics.md +++ b/docs/tools/target_metrics.md @@ -1,6 +1,21 @@ +## Features +- Filter bam file + - Filter reads that are not cell-associated. + - Filter reads that are not mapped to target genes. + +- Collect enrichment metrics. + +## Output +- `filtered.bam` BAM file after filtering. ## Arguments +`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported. + +`--bam` Input bam file + +`--match_dir` Match celescope scRNA-Seq directory. + `--outdir` Output diretory. `--assay` Assay name. @@ -11,9 +26,3 @@ `--debug` If this argument is used, celescope may output addtional file for debugging. 
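+
+## Example
+A minimal invocation might look like the sketch below. It assumes the
+standard `celescope <assay> <step>` entry point; all file paths are
+placeholders.
+```
+celescope snp target_metrics \
+    --outdir ./05.target_metrics \
+    --sample test1 \
+    --assay snp \
+    --gene_list gene_list.tsv \
+    --bam featureCounts_out.bam \
+    --match_dir /path/to/match_dir
+```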
-`--bam` featureCounts bam - -`--match_dir` match_dir - -`--gene_list` gene_list - -- Gitee From 328a45c371c19ce25bceaa73b3eb39817e00abdb Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 17 Jun 2021 14:29:44 +0800 Subject: [PATCH 47/96] update --- celescope/snp/multi_snp.py | 2 +- celescope/tests/test_multi.py | 5 +---- celescope/tools/star_mixin.py | 15 +++++---------- celescope/tools/target_metrics.py | 17 ++++++++++++++--- celescope/tools/utils.py | 14 ++++++++++++++ 5 files changed, 35 insertions(+), 18 deletions(-) diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index ea96c69c..bad75811 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -34,7 +34,7 @@ class Multi_snp(Multi): def variant_calling(self, sample): step = 'variant_calling' cmd_line = self.get_cmd_line(step, sample) - bam = f'{self.outdir_dic[sample]["target_metrics"]}/{sample}_filtered.bam' + bam = f'{self.outdir_dic[sample]["target_metrics"]}/{sample}_filtered_sorted.bam' cmd = ( f'{cmd_line} ' f'--bam {bam} ' diff --git a/celescope/tests/test_multi.py b/celescope/tests/test_multi.py index f1ae8032..68c9f23d 100755 --- a/celescope/tests/test_multi.py +++ b/celescope/tests/test_multi.py @@ -3,7 +3,6 @@ Integration tests """ import os -import shutil import subprocess from concurrent import futures @@ -14,7 +13,7 @@ ASSAYS = [ 'vdj', 'tag', 'capture_virus', - #'snp', + 'snp', 'rna', ] @@ -28,8 +27,6 @@ def run_single(assay, test_dir): print("*" * 20 + "running " + assay + "*" * 20) subprocess.check_call('sh run_shell.sh', shell=True) subprocess.check_call('sh sjm.sh', shell=True) - if os.path.exists("test1"): - shutil.rmtree("test1") try: subprocess.check_call('sh ./shell/test1.sh', shell=True) except subprocess.CalledProcessError: diff --git a/celescope/tools/star_mixin.py b/celescope/tools/star_mixin.py index 522eada1..3694959d 100755 --- a/celescope/tools/star_mixin.py +++ b/celescope/tools/star_mixin.py @@ -64,20 +64,15 @@ class StarMixin(): @utils.add_log def sort_bam(self): - cmd = ( - f'samtools sort {self.unsort_STAR_bam} ' - f'-o {self.STAR_bam} ' - f'--threads {self.thread} ' + utils.sort_bam( + self.unsort_STAR_bam, + self.STAR_bam, + threads=self.thread, ) - StarMixin.sort_bam.logger.info(cmd) - subprocess.check_call(cmd, shell=True) @utils.add_log def index_bam(self): - cmd = f"samtools index {self.STAR_bam}" - StarMixin.index_bam.logger.info(cmd) - subprocess.check_call(cmd, shell=True) - + utils.index_bam(self.STAR_bam) def get_star_metrics(self): """ diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index 9101d812..5fe8aa39 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -24,7 +24,7 @@ class Target_metrics(Step): Step.__init__(self, args, step_name) # set - self.match_barcode = set(utils.parse_match_dir(args.match_dir)["match_barcode"]) + self.match_barcode, _num = utils.read_barcode_file(args.match_dir) self.gene_list, self.n_gene = utils.read_one_col(args.gene_list) self.count_dict = utils.genDict(dim=3, valType=int) @@ -35,6 +35,7 @@ class Target_metrics(Step): # out file self.out_bam_file = f'{self.out_prefix}_filtered.bam' + self.out_bam_file_sorted = f'{self.out_prefix}_filtered_sorted.bam' @utils.add_log def read_bam_write_filtered(self): @@ -45,8 +46,12 @@ class Target_metrics(Step): gene_name = record.get_tag('GN') except KeyError: continue - barcode = record.get_tag('CB') - UMI = record.get_tag('UB') + # compatible with 10X bam + try: + barcode = record.get_tag('CB') + UMI = 
record.get_tag('UB') + except KeyError: + continue if barcode in self.match_barcode and gene_name in self.gene_list: writer.write(record) self.count_dict[barcode][gene_name][UMI] += 1 @@ -95,6 +100,12 @@ class Target_metrics(Step): def run(self): self.read_bam_write_filtered() self.parse_count_dict_add_metrics() + utils.sort_bam( + self.out_bam_file, + self.out_bam_file_sorted, + threads=self.thread, + ) + utils.index_bam(self.out_bam_file_sorted) self.clean_up() diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 6741903a..8715e4c5 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -809,3 +809,17 @@ def find_step_module_with_folder(assay, step): folder = module_path.split('.')[1] return step_module, folder + + +def sort_bam(input_bam, output_bam, threads=1): + cmd = ( + f'samtools sort {input_bam} ' + f'-o {output_bam} ' + f'--threads {threads} ' + ) + subprocess.check_call(cmd, shell=True) + + +def index_bam(input_bam): + cmd = f"samtools index {input_bam}" + subprocess.check_call(cmd, shell=True) \ No newline at end of file -- Gitee From 078c0058f2607b8ecac688fdf2546e49b30add64 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 17 Jun 2021 16:23:52 +0800 Subject: [PATCH 48/96] add split tag --- celescope/tag/__init__.py | 2 +- celescope/tag/multi_tag.py | 12 +++++++ celescope/tag/split_tag.py | 73 ++++++++++++++++++++++++++++++++++++++ celescope/tools/utils.py | 6 ++-- 4 files changed, 89 insertions(+), 4 deletions(-) create mode 100644 celescope/tag/split_tag.py diff --git a/celescope/tag/__init__.py b/celescope/tag/__init__.py index 3b73e247..2a655dc2 100755 --- a/celescope/tag/__init__.py +++ b/celescope/tag/__init__.py @@ -1,2 +1,2 @@ -__STEPS__ = ['sample', 'barcode', 'cutadapt', 'mapping_tag', 'count_tag', 'analysis_tag'] +__STEPS__ = ['sample', 'barcode', 'cutadapt', 'mapping_tag', 'count_tag', 'analysis_tag', 'split_tag'] __ASSAY__ = 'tag' diff --git a/celescope/tag/multi_tag.py b/celescope/tag/multi_tag.py index bae3cc15..0686fc6b 100755 --- a/celescope/tag/multi_tag.py +++ b/celescope/tag/multi_tag.py @@ -37,6 +37,18 @@ class Multi_tag(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) + def split_tag(self, sample): + step = 'split_tag' + umi_tag_file = f'{self.outdir_dic[sample]["count_tag"]}/{sample}_umi_tag.tsv' + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line} ' + f'--match_dir {self.col4_dict[sample]} ' + f'--umi_tag_file {umi_tag_file} ' + ) + self.process_cmd(cmd, step, sample, m=5, x=1) + + def main(): multi = Multi_tag(__ASSAY__) diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py new file mode 100644 index 00000000..aec7d977 --- /dev/null +++ b/celescope/tag/split_tag.py @@ -0,0 +1,73 @@ +""" +split scRNA-Seq fastq file(01.barcode/{sample}_2.fq) +""" +import glob +import os + +import pysam +import pandas as pd + +import celescope.tools.utils as utils +from celescope.tools.step import Step, s_common +from celescope.__init__ import HELP_DICT + +class Split_tag(Step): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + # set + + df_umi_tag = pd.read_csv(args.umi_tag_file, sep='\t', index_col=0) + df_umi_tag = df_umi_tag.rename_axis('barcode').reset_index() + self.tag_barcode_dict = {tag: set(row["barcode"].tolist()) for tag, row in df_umi_tag.groupby("tag")} + + if args.split_fastq: + self.rna_fq_file = glob.glob(f'{args.match_dir}/*barcode/*_2.fq*')[0] + + fastq_outdir = f'{args.outdir}/fastqs/' + os.system(f'mkdir -p {fastq_outdir}') + 
self.fastq_files_handle = {} + for tag in self.tag_barcode_dict: + fastq_file_name = f'{fastq_outdir}/{tag}_2.fq' + self.fastq_files_handle[tag] = open(fastq_file_name, 'w') + + @utils.add_log + def write_fastq_files(self): + read_num = 0 + with pysam.FastxFile(self.rna_fq_file, 'r') as rna_fq: + for read in rna_fq: + read_num += 1 + attr = read.name.strip("@").split("_") + barcode = attr[0] + for tag in self.tag_barcode_dict: + if barcode in self.tag_barcode_dict[tag]: + self.fastq_files_handle[tag].write(str(read)) + + if read_num % 1000000 == 0: + self.write_fastq_files.logger.info(f'{read_num} done') + + for tag in self.tag_barcode_dict: + self.fastq_files_handle[tag].close() + + + @utils.add_log + def run(self): + if self.args.split_fastq: + self.write_fastq_files() + +def split_tag(args): + step_name = "split_tag" + runner = Split_tag(args, step_name) + runner.run() + +def get_opts_split_tag(parser, sub_program): + parser.add_argument( + "--split_fastq", + help="Split scRNA-Seq fastq file(01.barcode/{sample}_2.fq).", + action='store_true', + ) + if sub_program: + parser.add_argument("--umi_tag_file", help="UMI tag file", required=True) + parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True) + s_common(parser) + diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 8715e4c5..95c01547 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -173,11 +173,11 @@ def link_data(outdir, fq_dict): fh.write('ln -sf %s %s\n' % (arr[1], s + '_2.fq.gz')) -def generic_open(file_name, mode='rt'): +def generic_open(file_name, *args, **kwargs): if file_name.endswith('.gz'): - file_obj = gzip.open(file_name, mode) + file_obj = gzip.open(file_name, *args, **kwargs) else: - file_obj = open(file_name, mode) + file_obj = open(file_name, *args, **kwargs) return file_obj @add_log -- Gitee From 40004bf55a722d6edb4756a7d6b0c7627bfcb7f2 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 18 Jun 2021 08:44:06 +0800 Subject: [PATCH 49/96] 1.3.2b1 --- celescope/__init__.py | 2 +- celescope/tag/split_tag.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index 9d9c8f0f..133775be 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -1,6 +1,6 @@ import os -__VERSION__ = "1.3.2b0" +__VERSION__ = "1.3.2b1" __version__ = __VERSION__ ASSAY_DICT = { diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index aec7d977..2510fd4d 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -41,7 +41,7 @@ class Split_tag(Step): barcode = attr[0] for tag in self.tag_barcode_dict: if barcode in self.tag_barcode_dict[tag]: - self.fastq_files_handle[tag].write(str(read)) + self.fastq_files_handle[tag].write(str(read) + '\n') if read_num % 1000000 == 0: self.write_fastq_files.logger.info(f'{read_num} done') -- Gitee From a2bdec47ac8b2c8ad683c77941b3ec40f04f198c Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 18 Jun 2021 11:34:48 +0800 Subject: [PATCH 50/96] update --- celescope/tag/split_tag.py | 41 +++++++++++++++++++++++++++++-------- release_local.py | 2 +- wdl/wdl.zip | Bin 17406 -> 17406 bytes 3 files changed, 33 insertions(+), 10 deletions(-) diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index 2510fd4d..0cfb5ccb 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -3,6 +3,7 @@ split scRNA-Seq fastq file(01.barcode/{sample}_2.fq) """ import glob import os +from collections import defaultdict import pysam import 
pandas as pd @@ -26,34 +27,55 @@ class Split_tag(Step): fastq_outdir = f'{args.outdir}/fastqs/' os.system(f'mkdir -p {fastq_outdir}') - self.fastq_files_handle = {} + + self.r2_fastq_files_handle = {} + self.r1_fastq_files_handle = {} for tag in self.tag_barcode_dict: - fastq_file_name = f'{fastq_outdir}/{tag}_2.fq' - self.fastq_files_handle[tag] = open(fastq_file_name, 'w') + r2_fastq_file_name = f'{fastq_outdir}/{tag}_2.fq' + self.r2_fastq_files_handle[tag] = open(r2_fastq_file_name, 'w') + r1_fastq_file_name = f'{fastq_outdir}/{tag}_1.fq' + self.r1_fastq_files_handle[tag] = open(r1_fastq_file_name, 'w') + + self.tag_read_index_dict = defaultdict(set) + @utils.add_log - def write_fastq_files(self): + def write_r2_fastq_files(self): read_num = 0 with pysam.FastxFile(self.rna_fq_file, 'r') as rna_fq: for read in rna_fq: read_num += 1 attr = read.name.strip("@").split("_") barcode = attr[0] + read_index = int(attr[2]) for tag in self.tag_barcode_dict: if barcode in self.tag_barcode_dict[tag]: - self.fastq_files_handle[tag].write(str(read) + '\n') + self.tag_read_index_dict[tag].add(read_index) + self.r2_fastq_files_handle[tag].write(str(read) + '\n') if read_num % 1000000 == 0: - self.write_fastq_files.logger.info(f'{read_num} done') + self.write_r2_fastq_files.logger.info(f'{read_num} done') - for tag in self.tag_barcode_dict: - self.fastq_files_handle[tag].close() + for tag in self.r2_fastq_files_handle: + self.r2_fastq_files_handle[tag].close() + + @utils.add_log + def write_r1_fastq_files(self): + with pysam.FastxFile(self.args.R1_read, 'r') as r1_read: + for read_index, read in enumerate(r1_read, start=1): + for tag in self.tag_read_index_dict: + if read_index in self.tag_read_index_dict[tag]: + self.r1_fastq_files_handle[tag].write(str(read) + '\n') + + for tag in self.r1_fastq_files_handle: + self.r1_fastq_files_handle[tag].close() @utils.add_log def run(self): if self.args.split_fastq: - self.write_fastq_files() + self.write_r2_fastq_files() + self.write_r1_fastq_files() def split_tag(args): step_name = "split_tag" @@ -69,5 +91,6 @@ def get_opts_split_tag(parser, sub_program): if sub_program: parser.add_argument("--umi_tag_file", help="UMI tag file", required=True) parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True) + parser.add_argument("--R1_read", help='R1 read path') s_common(parser) diff --git a/release_local.py b/release_local.py index 31998936..9686de9b 100755 --- a/release_local.py +++ b/release_local.py @@ -14,7 +14,7 @@ def create_conda(): source activate {ENV_NAME} conda install --file conda_pkgs.txt --channel conda-forge --channel bioconda --channel r --channel imperial-college-research-computing - pip install -i https://pypi.mirrors.ustc.edu.cn/simple/ celescope + pip install --no-cache-dir -i https://pypi.mirrors.ustc.edu.cn/simple/ celescope python setup.py install ln -s /SGRNJ/Database/script/soft/gatk-4.1.8.1/gatk {CONDA_ROOT}/{ENV_NAME}/bin/gatk """ diff --git a/wdl/wdl.zip b/wdl/wdl.zip index 03a2bedfffa11a0f4ff41ab71fbda0203921f927..97cc01e19dfac575d630b594f96a988f0fb10858 100644 GIT binary patch delta 312 zcmey@&iJpLkvG7bnT3l11h#mdn#e1~?wo!wA-nbDL_H_w@Kq-#&ta3BtZv6QalQic zxzi^n9@+?^HkUD;V+1qYnHPf@li#x30SN%Ow~M7{EqoX`+X z24;{mLE0nvXEH%8lbXygqzg8|UT7sVnDJWlIK<|y;yqvi7YP?tuoJp%H9%%eHgp!- ge8$#_87#=@XaW^fn4I9a5G=;xqzzZ-;k1nr0AK2VumAu6 delta 312 zcmey@&iJpLkvG7bnT3l11iAxGOyrefSG}+=;fU6OiF!`VI@|{)&ta3BtZv6QalQg` znEZi>hc<$!&1H<|7{Ls8=EY#f z!Kucs-G3m#Bj~{9R?a67qjI>7n83;x@=jz_uk1S+^wnmKM+_4K!#Pd{26hG+hN8Sg 
zz4DZt&=5`rW>cVckV_)@XEH%8lbXygqzg8|UT7sVnDJWlIK<|y;yqvi7YP?tW}r_e zciU<(1HC%g&{=Hr8Cxr6u)8=NO_+i10tzZjPH Date: Fri, 18 Jun 2021 13:09:32 +0800 Subject: [PATCH 51/96] fix --- celescope/templates/html/tracer_vdj/base.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/celescope/templates/html/tracer_vdj/base.html b/celescope/templates/html/tracer_vdj/base.html index 5318bb34..335cd104 100755 --- a/celescope/templates/html/tracer_vdj/base.html +++ b/celescope/templates/html/tracer_vdj/base.html @@ -137,7 +137,7 @@ {% include "html/common/cutadapt_summary.html"%} {% endif %} - {% if split_fastq is defined %} + {% if split_fastq_summary is defined %} {% include "html/tracer_vdj/split_fastq_summary.html"%} {% endif %} -- Gitee From 0196a7f9e34ae8cb6c594c68453a0b386703ea3d Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 13:10:38 +0800 Subject: [PATCH 52/96] add convert_summary html --- celescope/templates/html/trust_vdj/base.html | 20 +++++------ .../html/trust_vdj/convert_summary.html | 35 +++++++++++++++++++ 2 files changed, 43 insertions(+), 12 deletions(-) create mode 100644 celescope/templates/html/trust_vdj/convert_summary.html diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html index 5318bb34..83471801 100644 --- a/celescope/templates/html/trust_vdj/base.html +++ b/celescope/templates/html/trust_vdj/base.html @@ -129,24 +129,20 @@ {% include "html/common/sample_summary.html"%} {% endif %} - {% if barcode_summary is defined %} - {% include "html/common/barcode_summary.html"%} + {% if convert_summary is defined %} + {% include "html/trust_vdj/convert_summary.html"%} {% endif %} - {% if cutadapt_summary is defined %} - {% include "html/common/cutadapt_summary.html"%} + {% if trust_assemble_summary is defined %} + {% include "html/trust_vdj/trust_assemble_summary.html"%} {% endif %} - {% if split_fastq is defined %} - {% include "html/tracer_vdj/split_fastq_summary.html"%} + {% if map_summary is defined %} + {% include "html/trust_vdj/map_summary.html"%} {% endif %} - {% if go_assemble_summary is defined %} - {% include "html/tracer_vdj/go_assemble_summary.html"%} - {% endif %} - - {% if vdj_sum_summary is defined %} - {% include "html/tracer_vdj/vdj_sum_summary.html"%} + {% if res_filter_summary is defined %} + {% include "html/trust_vdj/res_filter_summary.html"%} {% endif %} {% if table_dict is defined %} diff --git a/celescope/templates/html/trust_vdj/convert_summary.html b/celescope/templates/html/trust_vdj/convert_summary.html new file mode 100644 index 00000000..4179741f --- /dev/null +++ b/celescope/templates/html/trust_vdj/convert_summary.html @@ -0,0 +1,35 @@ +
+<div id="convert_summary">
+    <h4>Demultiplexing</h4>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in convert_summary %}
+            {% if loop.index <= (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in convert_summary %}
+            {% if loop.index > (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+</div>
\ No newline at end of file -- Gitee From fc79f3591f27c1850c059b71f3873cc06841ce6b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 13:11:56 +0800 Subject: [PATCH 53/96] fix a bug of BCR cdr3 aa --- celescope/tracer_vdj/go_assemble.py | 1 + celescope/tracer_vdj/vdj_sum.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index 3a291d39..d614c90c 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -94,6 +94,7 @@ def assemble_summary(outdir, Seqtype, sample, species): 'total_count': total_count, }) os.system(f'rm {outdir}/BR{locus}.sam') + go_assemble_summary.insert(0, { 'item': 'All reads Mapped to IGH, IGL and IGK', 'count': total_mapped, diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index 01a25116..e5fd31a8 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -103,7 +103,7 @@ def filtering(Seqtype, ass_dir, outdir): tmplist = [] for nt in ntseqs: nt = Seq(nt) - nt = nt.reverse_complement() + nt = nt.translate() tmplist.append(str(nt)) tmp.insert(tmp.shape[1], f'IG{locus}_CDR3aa', tmplist) -- Gitee From 8e9c6ef95e86db034e605fe73d4924b76efa319b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 13:13:16 +0800 Subject: [PATCH 54/96] change matched fq name --- celescope/trust_vdj/res_filter.py | 11 +++++------ celescope/trust_vdj/trust_assemble.py | 9 +++++---- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index ce6b1e2d..c524d4e8 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -8,9 +8,9 @@ def beauty_res(outdir, barcode_report): res = pd.read_csv(barcode_report, sep='\t') rows = res.shape[0] loci = ['A', 'B'] - chians = ['chain2', 'chain1'] + chains = ['chain2', 'chain1'] for l in range(len(loci)): - chain = chians[l] + chain = chains[l] locus = loci[l] Vgenes, Dgenes, Jgenes, Cgenes, cdr3nts, cdr3aas, readcounts, fuls = [], [], [], [], [], [], [], [] @@ -65,11 +65,10 @@ class Res_filter(Step): def run(self): barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' res = beauty_res(self.outdir, barcode_report) - filtered = res[(res['TRB_fl']!='0')&(res['TRA_fl']!='0')] - fre = [''] * filtered.shape[0] - filtered.insert(filtered.shape[1], 'Frequent', fre) + fre = [''] * res.shape[0] + res.insert(res.shape[1], 'Frequent', fre) - clones = filtered.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) + clones = res.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) clones = clones.sort_values(by='Frequent', ascending=False) clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py index 1f09ab19..cbe9c25c 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/trust_assemble.py @@ -67,12 +67,12 @@ class Trust_assemble(Step): match_barcodes(self.outdir, self.match_dir, self.Seqtype, self.fq1) cmd1 = ( - f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R1.fq' + f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R1.fq' ) os.system(cmd1) cmd2 = ( - f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_R2.fq' + f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > 
{self.outdir}/{self.sample}_matched_R2.fq'
         )
         os.system(cmd2)
 
@@ -92,14 +92,15 @@ class Trust_assemble(Step):
         ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/human_IMGT+C.fa'
         cmd = (
             f'{TRUST} -t {self.thread} '
-            f'-u {self.outdir}/{self.sample}_R2.fq '
-            f'--barcode {self.outdir}/{self.sample}_R1.fq '
+            f'-u {self.outdir}/{self.sample}_matched_R2.fq '
+            f'--barcode {self.outdir}/{self.sample}_matched_R1.fq '
             f'--barcodeRange 0 23 + '
             f'-f {index_file} '
             f'--ref {ref} '
             f'-o {self.sample} --od {self.outdir}/TRUST4'
         )
 
+        Trust_assemble.run.logger.info(cmd)
         os.system(cmd)
 
         os.remove(f'{self.outdir}/seqlist.txt')
-- 
Gitee
From 7b80c6a5f101b1ea0d64902dfa029d4672cd7a57 Mon Sep 17 00:00:00 2001
From: zhouxinseeu
Date: Fri, 18 Jun 2021 13:13:54 +0800
Subject: [PATCH 55/96] add seqtk
---
 conda_pkgs.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/conda_pkgs.txt b/conda_pkgs.txt
index 3f032270..22f5fab0 100755
--- a/conda_pkgs.txt
+++ b/conda_pkgs.txt
@@ -10,4 +10,5 @@ r-argparser
 r-tidyverse
 mixcr=3.0.3
 bioconductor-dropletutils
-bcftools==1.9
\ No newline at end of file
+bcftools==1.9
+seqtk
\ No newline at end of file
-- 
Gitee
From e4a888f65c02d3a37c3d2fd4b900daff4883279c Mon Sep 17 00:00:00 2001
From: zhouyiqi
Date: Fri, 18 Jun 2021 15:51:18 +0800
Subject: [PATCH 56/96] remove glob_genomeDir
---
 celescope/capture_rna/tests.py | 35 ---------------------------------
 celescope/tools/count.py       |  8 ++++----
 celescope/tools/utils.py       | 36 ----------------------------------
 3 files changed, 4 insertions(+), 75 deletions(-)
 delete mode 100755 celescope/capture_rna/tests.py

diff --git a/celescope/capture_rna/tests.py b/celescope/capture_rna/tests.py
deleted file mode 100755
index 644953d2..00000000
--- a/celescope/capture_rna/tests.py
+++ /dev/null
@@ -1,35 +0,0 @@
-import unittest
-
-from celescope.tools.report import reporter
-
-
-class testHLA(unittest.TestCase):
-    def setUp(self):
-        '''
-        os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/0910_panel/')
-        self.sample = 'S20071508_D_TS'
-        count_detail_file = './/S20071508_D_TS/05.count_capture_rna/S20071508_D_TS_count_detail.txt'
-        self.df = pd.read_table(count_detail_file, header=0)
-        self.match_dir = '/SGRNJ02/RandD4/RD20051303_Panel/20200729/S20071508_D_ZL'
-        self.sc_cell_barcodes, self.sc_cell_number = read_barcode_file(self.match_dir)
-        self.outdir = f'{self.sample}/05.count_capture_rna/'
-        self.genomeDir = '/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92'
-        self.validated_barcodes, _ = read_one_col(f'{self.sample}/05.count_capture_rna/{self.sample}_matrix_10X/barcodes.tsv')
-        _refFlat, self.gtf = glob_genomeDir(self.genomeDir)
-        self.assay = 'capture_rna'
-        '''
-
-    @unittest.skip('pass')
-    def test_report(self):
-        t = reporter(assay=self.assay,
-                     name='count_capture_rna', sample=self.sample,
-                     stat_file=self.outdir + '/stat.txt',
-                     outdir=self.outdir + '/..')
-        t.get_report()
-
-
-
-
-
-if __name__ == '__main__':
-    unittest.main()
\ No newline at end of file
diff --git a/celescope/tools/count.py b/celescope/tools/count.py
index 44997442..f3635ee5 100755
--- a/celescope/tools/count.py
+++ b/celescope/tools/count.py
@@ -21,6 +21,7 @@ from celescope.tools.__init__ import (BARCODE_FILE_NAME, FEATURE_FILE_NAME,
 from celescope.tools.cellranger3 import get_plot_elements
 from celescope.tools.cellranger3.cell_calling_3 import cell_calling_3
 from celescope.tools.step import Step, s_common
+from celescope.rna.mkref import parse_genomeDir_rna
 
 TOOLS_DIR = os.path.dirname(__file__)
 random.seed(0)
@@ -76,10 +77,9 @@ class Count(Step):
self.cell_calling_method = args.cell_calling_method self.expected_cell_num = int(args.expected_cell_num) self.bam = args.bam - if args.genomeDir and args.genomeDir != "None": - _refFlat, self.gtf_file, _ = utils.glob_genomeDir(args.genomeDir) - else: - self.gtf_file = args.gtf + + # set + self.gtf_file = parse_genomeDir_rna(args.genomeDir)['gtf'] self.id_name = utils.get_id_name_dict(self.gtf_file) # output files diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 95c01547..e613b0d8 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -500,42 +500,6 @@ def format_ratios(ratios: dict): ratios[key] = round(ratios[key] * 100, 2) -@add_log -def glob_genomeDir(genomeDir, fa=False): - refFlat = glob.glob(genomeDir + "/*.refFlat") - if (len(refFlat) > 1): - sys.exit("ERROR: Multiple refFlat file in " + genomeDir) - elif (len(refFlat) == 0): - sys.exit("ERROR: refFlat file not found in " + genomeDir) - else: - refFlat = refFlat[0] - glob_genomeDir.logger.info("refFlat file found: " + refFlat) - - gtf = glob.glob(genomeDir + "/*.gtf") - if (len(gtf) == 0): - sys.exit("ERROR: gtf file not found in " + genomeDir) - elif (len(gtf) > 1): - gtf = glob.glob(genomeDir + "/*.chr.gtf") - if (len(gtf) == 0): - sys.exit("ERROR: No chr gtf file in "+ genomeDir) - if (len(gtf) > 1): - sys.exit("ERROR: Multiple gtf file in " + genomeDir) - else: - gtf = gtf[0] - glob_genomeDir.logger.info("chr gtf file found: " + gtf) - else: - gtf = gtf[0] - glob_genomeDir.logger.info("gtf file found: " + gtf) - - if fa: - fasta = glob.glob(genomeDir + "/*.fa") + glob.glob(genomeDir + "/*.fasta") - if len(fasta) > 1: - sys.exit("ERROR: Multiple fasta file in " + genomeDir) - fasta = fasta[0] - return refFlat, gtf, fasta - return refFlat, gtf, None - - def get_slope(x, y, window=200, step=10): assert len(x) == len(y) start = 0 -- Gitee From 3ff563edd5d92bfb65f9bc31aaa3c443f50408db Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 18 Jun 2021 16:28:12 +0800 Subject: [PATCH 57/96] barcode compatible with 10X bam --- celescope/snp/variant_calling.py | 8 +++++--- docs/CHANGELOG.md | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index 9b564e38..d36c83c1 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -117,8 +117,11 @@ class Variant_calling(Step): samfile = pysam.AlignmentFile(self.splitN_bam, "rb") header = samfile.header for read in samfile: - attr = read.query_name.split('_') - barcode = attr[0] + try: + barcode = read.get_tag('CB') + UMI = read.get_tag('UB') + except KeyError: + continue if barcode in self.barcodes: CID = self.barcodes.index(barcode) + 1 read.set_tag(tag='CL', value=f'CELL{CID}', value_type='Z') @@ -308,7 +311,6 @@ class Variant_calling(Step): VID_vcf.close() @staticmethod - @utils.add_log def cell_UMI(CID, outdir, final_vcf_file): df_vcf = parse_vcf(final_vcf_file) df_UMI = pd.DataFrame(columns=['VID', 'CID', 'ref_count', 'alt_count']) diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md index 4de53cb9..109dd6d7 100755 --- a/docs/CHANGELOG.md +++ b/docs/CHANGELOG.md @@ -2,9 +2,14 @@ ## [unreleased] - 2021-06-09 ### Added + ### Changed + ### Fixed +- `celescope.tools.count` will report an error when there are multiple gtf or refFlat file under `genomeDir`. 
+ ### Removed +- `celescope.tools.utils.glob_genomeDir` ## [1.3.1] - 2021-06-09 ### Added -- Gitee From 6ae9e593b6892ea94ba710f943df34a3e3117726 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 18 Jun 2021 18:11:55 +0800 Subject: [PATCH 58/96] use barcode_report.tsv to summary --- celescope/trust_vdj/res_filter.py | 120 ++++++++++++++++++------------ 1 file changed, 74 insertions(+), 46 deletions(-) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index c524d4e8..8e876672 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -1,57 +1,78 @@ import pandas as pd from celescope.tools.Step import Step, s_common from celescope.tools import utils +from collections import defaultdict @utils.add_log -def beauty_res(outdir, barcode_report): - res = pd.read_csv(barcode_report, sep='\t') - rows = res.shape[0] - loci = ['A', 'B'] +def beauty_report(barcode_report): + df = pd.read_csv(barcode_report, sep='\t') + rows = df.shape[0] chains = ['chain2', 'chain1'] - for l in range(len(loci)): + dic = defaultdict(list) + + for l in range(len(chains)): chain = chains[l] - locus = loci[l] - Vgenes, Dgenes, Jgenes, Cgenes, cdr3nts, cdr3aas, readcounts, fuls = [], [], [], [], [], [], [], [] + items = {'V': 0, 'D': 1, 'J': 2, 'C': 3, 'CDR3nt': 4, 'CDR3aa': 5, 'readcount': 6, 'full_length_assembly': -1} for i in range(rows): - attr = res.loc[i, chain] - attrs = attr.split(',') - if len(attrs) == 10: - V, D, J, C, cdr3nt, cdr3aa, readcount, fl = attrs[0], attrs[1], attrs[2], attrs[3], attrs[4], attrs[5], attrs[6], attrs[-1] - Vgenes.append(V) - Dgenes.append(D) - Jgenes.append(J) - Cgenes.append(C) - cdr3nts.append(cdr3nt) - cdr3aas.append(cdr3aa) - readcounts.append(readcount) - fuls.append(fl) - elif len(attrs) != 10: - Vgenes.append('NAN') - Dgenes.append('NAN') - Jgenes.append('NAN') - Cgenes.append('NAN') - cdr3nts.append('NAN') - cdr3aas.append('NAN') - readcounts.append('NAN') - fuls.append('NAN') - - res[f'TR{locus}_V'] = Vgenes - res[f'TR{locus}_D'] = Dgenes - res[f'TR{locus}_J'] = Jgenes - res[f'TR{locus}_C'] = Cgenes - res[f'TR{locus}_cdr3nt'] = cdr3nts - res[f'TR{locus}_cdr3aa'] = cdr3aas - res[f'TR{locus}_readcount'] = readcounts - res[f'TR{locus}_fl'] = fuls - - res.to_csv(f'{outdir}/new_barcode_report.tsv', sep='\t') + cb = df.loc[i, '#barcode'] + dic['barcode'].append(cb) + for item in items: + attr = df.loc[i, chain] + attrs = attr.split(',') + + if len(attrs) == 10: + dic[f'{item}'].append(attrs[items[item]]) + + elif len(attrs) != 10: + dic[f'{item}'].append('None') + + res = pd.DataFrame(dic, columns=list(dic.keys())) return res +def get_clone_table(df, Seqtype): + res = pd.DataFrame() + group_type = [] + if Seqtype == 'TCR': + chains = ['TRA', 'TRB'] + if Seqtype == 'BCR': + chains = ['IGH', 'IGL', 'IGK'] + for chain in chains: + tmp = df[df['V'].str.contains(chain, na=False)] + tmp = tmp.set_index('barcode') + tmp = tmp.rename(columns=lambda x: f'{chain}_'+x) + + res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None') + group_type.append(f'{chain}_CDR3aa') + + + Frequent = [''] * res.shape[0] + res.insert(res.shape[1], 'Frequent', Frequent) + clonetypes = res.groupby(group_type).agg({'Frequent': 'count'}) + clonetypes = clonetypes.sort_values(by='Frequent', ascending=False) + + sum_c = clonetypes['Frequent'].sum() + proportions = [] + for f in list(clonetypes['Frequent']): + p = f/sum_c + p = p * 100 + p = round(p, 2) + p = str(p) + '%' + proportions.append(p) + clonetypes['Proportion'] = 
proportions + clonetypes = clonetypes.sort_values(by='Frequent', ascending=False) + clonetypes = clonetypes.reset_index() + + clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))] + clonetypes = clonetypes.reindex(columns=list(['CloneId', 'TRA_CDR3aa', 'TRB_CDR3aa', 'Frequent', 'Proportion'])) + + return clonetypes + + class Res_filter(Step): def __init__(self, args, step_name): @@ -59,20 +80,26 @@ class Res_filter(Step): self.outdir = args.outdir self.sample = args.sample + self.Seqtype = args.Seqtype @utils.add_log def run(self): barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' - res = beauty_res(self.outdir, barcode_report) - fre = [''] * res.shape[0] - res.insert(res.shape[1], 'Frequent', fre) + df = beauty_report(barcode_report) + df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t') - clones = res.groupby(['TRA_cdr3aa', 'TRB_cdr3aa']).agg({'Frequent': 'count'}) - clones = clones.sort_values(by='Frequent', ascending=False) + clones = get_clone_table(df, self.Seqtype) clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') + title = 'Clonetypes' + table_dict = self.get_table(title, 'clonetypes_table', clones) + + self.add_data_item(table_dict=table_dict) + + self.clean_up() + @utils.add_log def res_filter(args): @@ -82,5 +109,6 @@ def res_filter(args): def get_opts_res_filter(parser, sub_program): - if sub_program: - parser = s_common(parser) \ No newline at end of file + parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + if sub_program: + parser = s_common(parser) \ No newline at end of file -- Gitee From e94485ae8be761110dd919557a10d04466b91388 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Tue, 22 Jun 2021 16:10:48 +0800 Subject: [PATCH 59/96] upgrade pandas; remove mutract --- celescope/tools/count.py | 2 +- requirements.txt | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/celescope/tools/count.py b/celescope/tools/count.py index f3635ee5..26ea2be7 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -338,7 +338,7 @@ class Count(Step): os.mkdir(matrix_dir) df_UMI = df.groupby(['geneID', 'Barcode']).agg({'UMI': 'count'}) - mtx = coo_matrix((df_UMI.UMI, (df_UMI.index.labels[0], df_UMI.index.labels[1]))) + mtx = coo_matrix((df_UMI.UMI, (df_UMI.index.codes[0], df_UMI.index.codes[1]))) gene_id = df_UMI.index.levels[0].to_series() # add gene symbol gene_name = gene_id.apply(lambda x: self.id_name[x]) diff --git a/requirements.txt b/requirements.txt index 6aa50c92..42f0e02c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,11 +2,9 @@ cutadapt==1.17 pysam==0.16.0.1 scipy==1.4.1 numpy==1.19.5 -pandas==0.23.4 jinja2>=2.10 -matplotlib==2.2.2 xopen>=0.5.0 editdistance>=0.5.3 -mutract sklearn==0.0 plotly==4.14.3 +plotnine==0.8.0 -- Gitee From b62eab8be0eb0a926ff61c0bb4ebac8fe88291b8 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 22 Jun 2021 19:11:22 +0800 Subject: [PATCH 60/96] add report --- celescope/__init__.py | 4 +- celescope/templates/html/trust_vdj/base.html | 4 - .../html/trust_vdj/res_filter_summary.html | 36 ++++++ .../trust_vdj/trust_assemble_summary.html | 40 ++++++ celescope/trust_vdj/res_filter.py | 110 ++++++++++++++-- celescope/trust_vdj/trust_assemble.py | 117 +++++++++++++++--- 6 files changed, 272 insertions(+), 39 deletions(-) create mode 100644 celescope/templates/html/trust_vdj/res_filter_summary.html create mode 100644 celescope/templates/html/trust_vdj/trust_assemble_summary.html diff --git 
a/celescope/__init__.py b/celescope/__init__.py index d1f25d8b..983fb9fc 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -14,6 +14,6 @@ ASSAY_DICT = { 'tag': 'Single Cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', - 'tracer_vdj': 'Single Cell Full Length vdj assemble', - 'trust_vdj': 'Single Cell Full Length vdj assemble' + 'tracer_vdj': 'Single Cell Full Length V(D)J Assemble', + 'trust_vdj': 'Single Cell Full Length V(D)J Assemble' } diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html index 83471801..fcd8607c 100644 --- a/celescope/templates/html/trust_vdj/base.html +++ b/celescope/templates/html/trust_vdj/base.html @@ -137,10 +137,6 @@ {% include "html/trust_vdj/trust_assemble_summary.html"%} {% endif %} - {% if map_summary is defined %} - {% include "html/trust_vdj/map_summary.html"%} - {% endif %} - {% if res_filter_summary is defined %} {% include "html/trust_vdj/res_filter_summary.html"%} {% endif %} diff --git a/celescope/templates/html/trust_vdj/res_filter_summary.html b/celescope/templates/html/trust_vdj/res_filter_summary.html new file mode 100644 index 00000000..f4a403f6 --- /dev/null +++ b/celescope/templates/html/trust_vdj/res_filter_summary.html @@ -0,0 +1,36 @@ +
+<div id="res_filter_summary">
+    <h4>Cell</h4>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in res_filter_summary %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <div class="col-md-6">
+        {{ chart|safe }}
+    </div>
+</div>
\ No newline at end of file diff --git a/celescope/templates/html/trust_vdj/trust_assemble_summary.html b/celescope/templates/html/trust_vdj/trust_assemble_summary.html new file mode 100644 index 00000000..0d18b19f --- /dev/null +++ b/celescope/templates/html/trust_vdj/trust_assemble_summary.html @@ -0,0 +1,40 @@ +
+<div id="trust_assemble_summary">
+    <h4>Mapping</h4>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in trust_assemble_summary %}
+            {% if loop.index <= (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+    <div class="col-md-6">
+        <table class="table">
+            <tbody>
+            {% for item in trust_assemble_summary %}
+            {% if loop.index > (loop.length+1)/2 %}
+                <tr>
+                {% for i in item %}
+                    <td>{{ i|e }}</td>
+                {% endfor %}
+                </tr>
+            {% endif %}
+            {% endfor %}
+            </tbody>
+        </table>
+    </div>
+</div>
\ No newline at end of file
diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py
index 8e876672..883ae849 100644
--- a/celescope/trust_vdj/res_filter.py
+++ b/celescope/trust_vdj/res_filter.py
@@ -2,6 +2,8 @@ import pandas as pd
 from celescope.tools.Step import Step, s_common
 from celescope.tools import utils
 from collections import defaultdict
+from celescope.tools.cellranger3 import get_plot_elements
+import numpy as np
 
 
 @utils.add_log
@@ -35,24 +37,27 @@ def beauty_report(barcode_report):
 
 
 def get_clone_table(df, Seqtype):
+    res_filter_summary = []
+
     res = pd.DataFrame()
     group_type = []
     if Seqtype == 'TCR':
         chains = ['TRA', 'TRB']
+        paired_groups = ['TRA_TRB']
     if Seqtype == 'BCR':
         chains = ['IGH', 'IGL', 'IGK']
-    for chain in chains:
-        tmp = df[df['V'].str.contains(chain, na=False)]
-        tmp = tmp.set_index('barcode')
-        tmp = tmp.rename(columns=lambda x: f'{chain}_'+x)
-
-        res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None')
-        group_type.append(f'{chain}_CDR3aa')
-
+        paired_groups = ['IGH_IGL', 'IGH_IGK']
+    for chain in chains:
+        tmp = df[df['V'].str.contains(chain, na=False)]
+        tmp = tmp.set_index('barcode')
+        tmp = tmp.rename(columns=lambda x: f'{chain}_'+x)
+
+        res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None')
+        group_type.append(f'{chain}_CDR3aa')
 
     Frequent = [''] * res.shape[0]
     res.insert(res.shape[1], 'Frequent', Frequent)
-    clonetypes = res.groupby(group_type).agg({'Frequent': 'count'})
+    clonetypes = res.groupby(group_type, as_index=False).agg({'Frequent': 'count'})
     clonetypes = clonetypes.sort_values(by='Frequent', ascending=False)
 
     sum_c = clonetypes['Frequent'].sum()
@@ -67,10 +72,44 @@ def get_clone_table(df, Seqtype):
     clonetypes = clonetypes.sort_values(by='Frequent', ascending=False)
     clonetypes = clonetypes.reset_index()
 
-    clonetypes['CloneId'] = [i for i in range(1, (clonetypes.shape[0]+1))]
-    clonetypes = clonetypes.reindex(columns=list(['CloneId', 'TRA_CDR3aa', 'TRB_CDR3aa', 'Frequent', 'Proportion']))
+    clonetype_ids = [(i+1) for i in clonetypes.index.tolist()]
+    clonetypes['index'] = clonetype_ids
+    clonetypes = clonetypes.rename(columns={'index': 'CloneId'})
 
-    return clonetypes
+    total_count = int(clonetypes['Frequent'].sum())
+
+    res_filter_summary.append({
+        'item': 'Estimated Number of Cells',
+        'count': total_count,
+        'total_count': np.nan
+    })
+
+    for group in group_type:
+        chain = group.strip('_CDR3aa')
+        tmp = clonetypes[clonetypes[group]!='None']
+        count = int(tmp['Frequent'].sum())
+        item = f'Cells with {chain}'
+        res_filter_summary.append({
+            'item': item,
+            'count': count,
+            'total_count': total_count
+        })
+
+    for pg in paired_groups:
+        attrs = pg.split('_')
+        chain1 = attrs[0]
+        chain2 = attrs[1]
+        tmp = clonetypes[(clonetypes[f'{chain1}_CDR3aa']!='None') & (clonetypes[f'{chain2}_CDR3aa']!='None')]
+        item = f'Cells with paired {chain1} and {chain2}'
+        count = int(tmp['Frequent'].sum())
+        res_filter_summary.append({
+            'item': item,
+            'count': count,
+            'total_count': total_count
+        })
+
+
+    return clonetypes, res_filter_summary
 
 
 class Res_filter(Step):
@@ -81,15 +120,52 @@ class Res_filter(Step):
 
         self.outdir = args.outdir
         self.sample = args.sample
         self.Seqtype = args.Seqtype
+        self.full_length = args.full_length
 
     @utils.add_log
     def run(self):
         barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv'
         df = beauty_report(barcode_report)
+
+        if self.full_length:
+            df = df[df['full_length_assembly']=='1']
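+        # write out the (optionally full-length-only) barcode report used by the clonotype table below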
df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t') - clones = get_clone_table(df, self.Seqtype) + clones, res_filter_summary = get_clone_table(df, self.Seqtype) + + # plot barcode umi + count_file = f'{self.outdir}/../02.trust_assemble/count.txt' + df_umi = pd.read_csv(count_file, sep='\t', index_col=False) + cells = set(df['barcode'].tolist()) + df_umi['mark'] = df_umi['barcode'].apply(lambda x: 'CB' if (x in cells) else 'UB') + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi.to_csv(count_file, sep='\t', index=False) + + self.add_data_item(chart=get_plot_elements.plot_barcode_rank(count_file)) + + if self.Seqtype == 'TCR': + chains = ['TRA', 'TRB'] + elif self.Seqtype == 'BCR': + chains = ['IGH', 'IGL', 'IGK'] + + for chain in chains: + tmp = df[df['V'].str.contains(chain, na=False)] + barcodes = tmp['barcode'].tolist() + if len(barcodes) != 0: + df_bc = pd.DataFrame(barcodes, columns=['barcode']) + else: + continue + + tmp_df = pd.merge(df_umi, df_bc, on='barcode', how='inner') + + mid = int(tmp_df['UMI'].median()) + item = f'Median {chain} UMIs per cell' + res_filter_summary.append({ + 'item': item, + 'count': mid, + 'total_count': np.nan + }) clones.to_csv(f'{self.outdir}/clonetype.tsv', sep='\t') @@ -98,6 +174,13 @@ class Res_filter(Step): self.add_data_item(table_dict=table_dict) + + stat_file = self.outdir + '/stat.txt' + + sum_df = pd.DataFrame(res_filter_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(sum_df, stat_file) + self.clean_up() @@ -110,5 +193,6 @@ def res_filter(args): def get_opts_res_filter(parser, sub_program): parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) + parser.add_argument('--full_length', help='only output full length assembly', action='store_true') if sub_program: parser = s_common(parser) \ No newline at end of file diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py index cbe9c25c..e052d0ae 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/trust_assemble.py @@ -5,43 +5,109 @@ from celescope.tracer_vdj.split_fastq import get_barcodes from celescope.tools.barcode import * import pysam import pandas as pd +from collections import defaultdict TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/run-trust4' def count_fq(fq1): - bcs, umis, names = [], [], [] - count_df = pd.DataFrame() + dic = defaultdict(list) with pysam.FastxFile(fq1) as fq: for entry in fq: attr = entry.sequence cb = attr[:24] umi = attr[24:] name = entry.name - bcs.append(cb) - umis.append(umi) - names.append(name) - count_df['barcode'] = bcs - count_df['UMI'] = umis - count_df['seq_name'] = names + dic['barcode'].append(cb) + dic['UMI'].append(umi) + dic['seq_name'].append(name) + + count_df = pd.DataFrame(dic, columns=list(dic.keys())) return count_df + @utils.add_log def match_barcodes(outdir, match_dir, Seqtype, fq1): annotated_bcs = get_barcodes(match_dir, Seqtype) bcs_df = pd.DataFrame(annotated_bcs, columns=['barcode']) count_df = count_fq(fq1) - df = pd.merge(bcs_df, count_df, on='barcode', how='inner') - seqnames = df['seq_name'].tolist() + + # count UMI + df_umi = count_df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) + df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi.to_csv(f'{outdir}/count.txt', sep='\t', index=False) + + df_n = pd.merge(bcs_df, count_df, on='barcode', how='inner') + seqnames = 
df_n['seq_name'].tolist() seqlist = open(f'{outdir}/seqlist.txt', 'w') for name in seqnames: seqlist.write(str(name) + '\n') - count_df.to_csv(f'{outdir}/count.txt', sep='\t') - df.to_csv(f'{outdir}/matched_count.txt', sep='\t') + +def mapping_summary(outdir, Seqtype, fq, species): + stat_file = outdir + '/stat.txt' + + trust_assemble_summary = [] + + total_mapped = 0 + + #with pysam.FastxFile(fq) as fh: + #total_count = 0 + #for entry in fh: + #total_count += 1 + + if Seqtype == 'TCR': + loci = ['TRA', 'TRB'] + stat_string = 'All reads Mapped to TRA and TRB' + + elif Seqtype == 'BCR': + loci = ['IGH', 'IGL', 'IGK'] + stat_string = 'All reads Mapped to IGH, IGL and IGK' + + for locus in loci: + cmd = ( + f'source activate bracer; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{locus} ' + f'-U {fq} ' + f'-S {outdir}/{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + item = f'Reads mapped to {locus}' + count = int(res[0]) + total_mapped += count + trust_assemble_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/{locus}.sam') + + trust_assemble_summary.insert(0, { + 'item': stat_string, + 'count': total_mapped, + 'total_count': total_count + }) + + os.system(f'rm {outdir}/log') + + df = pd.DataFrame(trust_assemble_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + class Trust_assemble(Step): """ @@ -60,6 +126,7 @@ class Trust_assemble(Step): self.fq2 = args.fq2 self.sample = args.sample self.species = args.species + self.speed_up = args.speed_up @utils.add_log @@ -84,12 +151,12 @@ class Trust_assemble(Step): species = self.species - if species =='Mmus': - index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/GRCm38_bcrtcr.fa' - ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/mouse/mouse_IMGT+C.fa' - elif species == 'Hsap': - index_file = '/SGRNJ03/randd/zhouxin/software/TRUST4/hg38_bcrtcr.fa' - ref = '/SGRNJ03/randd/zhouxin/software/TRUST4/human_IMGT+C.fa' + index_file = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa' + ref = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_IMGT+C.fa' + + string1 = '' + if self.speed_up: + string1 = '--repseq ' cmd = ( f'{TRUST} -t {self.thread} ' f'-u {self.outdir}/{self.sample}_matched_R2.fq ' @@ -97,14 +164,23 @@ class Trust_assemble(Step): f'--barcodeRange 0 23 + ' f'-f {index_file} ' f'--ref {ref} ' + f'{string1}' f'-o {self.sample} --od {self.outdir}/TRUST4' ) Trust_assemble.run.logger.info(cmd) - os.system(cmd) + + if not os.path.exists(f'{self.outdir}/TRUST4/{self.sample}_barcode_report.tsv'): + os.system(cmd) + + #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' + + mapping_summary(self.outdir, self.Seqtype, self.fq2, species) os.remove(f'{self.outdir}/seqlist.txt') + self.clean_up() + @utils.add_log def trust_assemble(args): @@ -120,7 +196,8 @@ def get_opts_trust_assemble(parser, sub_program): parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) parser.add_argument('--match_dir', help='match_dir', required=True) parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) - parser.add_argument('--species', help='species', choices=["Mmus", 
"Hsap"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') -- Gitee From 12e33fb7125cfda9e002c36fa336641ae4f9b365 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 22 Jun 2021 19:12:24 +0800 Subject: [PATCH 61/96] rm Median UMIs per cell --- celescope/tracer_vdj/vdj_sum.py | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/celescope/tracer_vdj/vdj_sum.py b/celescope/tracer_vdj/vdj_sum.py index e5fd31a8..daed728d 100644 --- a/celescope/tracer_vdj/vdj_sum.py +++ b/celescope/tracer_vdj/vdj_sum.py @@ -216,12 +216,7 @@ class Vdj_sum(Step): 'count': paired_cell, 'total_count': productive_cells_num, }) - - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': median_all, - 'total_count': np.nan - }) + for locus in loci: tmp = glob.glob(f'{ass_dir}/tracer/*/aligned_reads/*_TCR_{locus}.fastq') @@ -310,11 +305,6 @@ class Vdj_sum(Step): 'total_count': productive_cells_num }) - vdj_sum_summary.append({ - 'item': 'Median UMIs per cell', - 'count': median_all, - 'total_count': np.nan - }) for locus in loci: tmp = glob.glob(f'{ass_dir}/bracer/*/aligned_reads/*_BCR_{locus}.fastq') -- Gitee From 035d1c1a6c24731fc29110f89ebfe735ea1daadd Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Tue, 22 Jun 2021 19:12:57 +0800 Subject: [PATCH 62/96] rewrite map --- celescope/tracer_vdj/go_assemble.py | 101 ++++++++++------------------ 1 file changed, 36 insertions(+), 65 deletions(-) diff --git a/celescope/tracer_vdj/go_assemble.py b/celescope/tracer_vdj/go_assemble.py index d614c90c..cda37f1b 100755 --- a/celescope/tracer_vdj/go_assemble.py +++ b/celescope/tracer_vdj/go_assemble.py @@ -30,77 +30,48 @@ def assemble_summary(outdir, Seqtype, sample, species): total_count = count_df['readcount'].sum() + total_mapped = 0 + if Seqtype == 'TCR': loci = ['A', 'B'] - - total_mapped = 0 - - for locus in loci: - cmd = ( - f'source activate {BRACER_CONDA}; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/tracer/resources/{species}/combinatorial_recombinomes/TCR_{locus} ' - f'-U {clean_fq} ' - f'-S {outdir}/TR{locus}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - with open(f'{outdir}/log') as fh: - for line in fh: - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - item = f'Reads mapped to TR{locus}' - count = int(res[0]) - total_mapped += count - go_assemble_summary.append({ - 'item': item, - 'count': count, - 'total_count': total_count, - }) - - os.system(f'rm {outdir}/TR{locus}.sam') - - go_assemble_summary.insert(0, { - 'item': 'All reads Mapped to TRA and TRB', - 'count': total_mapped, - 'total_count': total_count - }) - - os.system(f'rm {outdir}/log') + stat_string = 'All reads Mapped to TRA and TRB' elif Seqtype == 'BCR': loci = ['H', 'L', 'K'] + stat_string = 'All reads Mapped to IGH, IGL and IGK' + + for locus in loci: + cmd = ( + f'source activate {BRACER_CONDA}; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/tracer/resources/{species}/combinatorial_recombinomes/TCR_{locus} ' + f'-U {clean_fq} ' + f'-S {outdir}/TR{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + item = f'Reads mapped to TR{locus}' + count = int(res[0]) + total_mapped += count + go_assemble_summary.append({ + 
'item': item, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/TR{locus}.sam') + + go_assemble_summary.insert(0, { + 'item': stat_string, + 'count': total_mapped, + 'total_count': total_count + }) + + os.system(f'rm {outdir}/log') - total_mapped = 0 - - for locus in loci: - cmd = ( - f'source activate {BRACER_CONDA}; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/bracer/resources/{species}/combinatorial_recombinomes/BCR_{locus} ' - f'-U {clean_fq} ' - f'-S {outdir}/BR{locus}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - with open(f'{outdir}/log') as fh: - for line in fh: - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - item = f'Reads mapped to BR{locus}' - count = int(res[0]) - total_mapped += count - go_assemble_summary.append({ - 'item': item, - 'count': count, - 'total_count': total_count, - }) - os.system(f'rm {outdir}/BR{locus}.sam') - - go_assemble_summary.insert(0, { - 'item': 'All reads Mapped to IGH, IGL and IGK', - 'count': total_mapped, - 'total_count': total_count - }) - os.system(f'rm {outdir}/log') df = pd.DataFrame(go_assemble_summary, columns=['item', 'count', 'total_count']) -- Gitee From 46863cc82edcb129111d476dde3e74b9f3ac7847 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 11:02:34 +0800 Subject: [PATCH 63/96] plot_vid --- celescope/snp/utils/plot_vid.py | 88 +++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 celescope/snp/utils/plot_vid.py diff --git a/celescope/snp/utils/plot_vid.py b/celescope/snp/utils/plot_vid.py new file mode 100644 index 00000000..86f489f5 --- /dev/null +++ b/celescope/snp/utils/plot_vid.py @@ -0,0 +1,88 @@ +import ast +import argparse +import glob +import os + +import pandas as pd +from plotnine import aes, geom_point, ggplot + +from celescope.celescope import ArgFormatter +import celescope.tools.utils as utils + + +SAMPLE_COL_INDEX = 2 +MATCH_DIR_COL_INDEX = 3 +VID_COL_INDEX = 4 + +@utils.add_log +def parse_mapfile(mapfile): + sample_vid_dict = {} + sample_match_dir_dict = {} + df_mapfile = pd.read_csv(mapfile, sep='\t', header=None) + def read_row(row): + sample = row[SAMPLE_COL_INDEX] + match_dir = row[MATCH_DIR_COL_INDEX] + vid_list = [int(vid) for vid in row[VID_COL_INDEX ].strip().split(',')] + sample_vid_dict[sample] = vid_list + sample_match_dir_dict[sample] = match_dir + + df_mapfile.apply(read_row, axis=1) + return sample_vid_dict, sample_match_dir_dict + +class Plot_vid(): + def __init__(self, sample, outdir, vid_list, snp_dir, match_dir): + self.sample = sample + self.vid_list = vid_list + + # set + vid_tsne_file = glob.glob(f'{snp_dir}/08.analysis_snp/*count_tsne.tsv')[0] + self.df_vid_tsne = pd.read_csv(vid_tsne_file, sep='\t', converters={"VID":ast.literal_eval}) + match_tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0] + self.df_match_tsne = pd.read_csv( match_tsne_file, sep='\t', index_col=0) + + # out + if not os.path.exists(outdir): + os.system(f'mkdir -p {outdir}') + self.out_prefix = f'{outdir}/{sample}' + self.out_plot_file = f'{self.out_prefix}_VID_tsne.png' + + @utils.add_log + def plot_vid(self): + def set_label(row): + for vid in self.vid_list: + row["VIDs"] = "wild_type" + if vid in row["VID"]: + row["VIDs"] = "mutation" + break + return row + df = self.df_vid_tsne.apply(set_label, axis=1) + barcode_list = df.loc[df["VIDs"]=="mutation",]["barcode"] + self.df_match_tsne["VIDs"] = "wild_type" + self.df_match_tsne.loc[barcode_list, "VIDs"] = "mutation" + 
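+        # plotnine scatter: match-dir t-SNE coordinates, colored by mutation status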
plot = ggplot(self.df_match_tsne, aes(x="tSNE_1",y="tSNE_2",color="VIDs")) + geom_point(size=0.2) + plot.save(self.out_plot_file) + + +def main(): + parser = argparse.ArgumentParser(description='plot snp', formatter_class=ArgFormatter) + parser.add_argument("--mapfile", help="mapfile with VIDs as 5th column", required=True) + parser.add_argument("--outdir", help="output dir", default='plot_VID') + args = parser.parse_args() + + sample_vid_dict, sample_match_dir_dict = parse_mapfile(args.mapfile) + for sample in sample_vid_dict: + vid_list = sample_vid_dict[sample] + match_dir = sample_match_dir_dict[sample] + + runner = Plot_vid( + sample=sample, + outdir=args.outdir, + vid_list=vid_list, + snp_dir=sample, + match_dir=match_dir + ) + runner.plot_vid() + + +if __name__ == '__main__': + main() -- Gitee From 13fbbbf809ad870c96cdd5b7304ebd450275b26d Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:09:06 +0800 Subject: [PATCH 64/96] scipy 1.5.0 --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 42f0e02c..2ee11454 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ cutadapt==1.17 pysam==0.16.0.1 -scipy==1.4.1 +scipy==1.5.0 numpy==1.19.5 jinja2>=2.10 xopen>=0.5.0 -- Gitee From 058009ad7f5a6a3049ac03e25ba5c73494a04219 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:36:25 +0800 Subject: [PATCH 65/96] fix --- celescope/tools/count.py | 2 +- celescope/tools/utils.py | 2 +- requirements.txt | 1 + 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/celescope/tools/count.py b/celescope/tools/count.py index 26ea2be7..b532f646 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -347,7 +347,7 @@ class Count(Step): barcodes = df_UMI.index.levels[1].to_series() genes.to_csv(f'{matrix_dir}/{FEATURE_FILE_NAME}', index=False, sep='\t', header=False) - barcodes.to_csv(f'{matrix_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t') + barcodes.to_csv(f'{matrix_dir}/{BARCODE_FILE_NAME}', index=False, sep='\t', header=False) mmwrite(f'{matrix_dir}/{MATRIX_FILE_NAME}', mtx) @utils.add_log diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index e613b0d8..72ecf120 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -427,7 +427,7 @@ def parse_map_col4(mapfile, default_val): library_id = tmp[0] library_path = tmp[1] sample_name = tmp[2] - if len(tmp) == 4: + if len(tmp) >= 4: col4 = tmp[3] else: col4 = default_val diff --git a/requirements.txt b/requirements.txt index 2ee11454..03237668 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,3 +8,4 @@ editdistance>=0.5.3 sklearn==0.0 plotly==4.14.3 plotnine==0.8.0 +cython -- Gitee From 3362569192d61a271d978afaf61991cd4566bd97 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:44:20 +0800 Subject: [PATCH 66/96] fix --- celescope/tools/multi.py | 11 ++++++----- celescope/tools/utils.py | 1 + requirements.txt | 1 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 89e608cc..afdce712 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -5,7 +5,7 @@ import os from collections import defaultdict import celescope -from celescope.tools.utils import find_assay_init, find_step_module +import celescope.tools.utils as utils from celescope.celescope import ArgFormatter TOOLS_DIR = os.path.dirname(celescope.tools.__file__) @@ -15,7 +15,7 @@ class Multi(): def __init__(self, assay): 
self.__ASSAY__ = assay - init_module = find_assay_init(assay) + init_module = utils.find_assay_init(assay) self.__STEPS__ = init_module.__STEPS__ self.__CONDA__ = os.path.basename(os.environ['CONDA_DEFAULT_ENV']) self.__APP__ = 'celescope' @@ -71,11 +71,12 @@ class Multi(): def step_args(self): for step in self.__STEPS__: - step_module = find_step_module(self.__ASSAY__, step) + step_module = utils.find_step_module(self.__ASSAY__, step) func_opts = getattr(step_module, f"get_opts_{step}") func_opts(self.parser, sub_program=False) @staticmethod + @utils.add_log def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = {} @@ -86,7 +87,7 @@ class Multi(): continue line_split = line.split() library_id, library_path, sample_name = line_split[:3] - if len(line_split) == 4: + if len(line_split) >= 4: col4 = line_split[3] else: col4 = default_val @@ -165,7 +166,7 @@ job_end self.last_step = step def parse_step_args(self, step): - step_module = find_step_module(self.__ASSAY__, step) + step_module = utils.find_step_module(self.__ASSAY__, step) func_opts = getattr(step_module, f"get_opts_{step}") step_parser = argparse.ArgumentParser(step_module) func_opts(step_parser, sub_program=False) diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 72ecf120..c3085586 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -413,6 +413,7 @@ def get_fq(library_id, library_path): return fq1, fq2 +@add_log def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = defaultdict(list) diff --git a/requirements.txt b/requirements.txt index 03237668..adb9cd90 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,4 +8,5 @@ editdistance>=0.5.3 sklearn==0.0 plotly==4.14.3 plotnine==0.8.0 +matplotlib==3.3.0 cython -- Gitee From 81a14141f28eb3119e29ecd89f169e8dfe57c163 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:52:53 +0800 Subject: [PATCH 67/96] remove pip install celescope --- .github/workflows/setup.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/setup.yml b/.github/workflows/setup.yml index f35a9637..dedf5c52 100644 --- a/.github/workflows/setup.yml +++ b/.github/workflows/setup.yml @@ -34,7 +34,6 @@ jobs: if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Install run: | - pip install celescope python setup.py install # test -- Gitee From c10e1f8fc491e5a061f22610e4a46c9f3b5376a6 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 13:57:19 +0800 Subject: [PATCH 68/96] fix mutract --- celescope/snp/analysis_snp.py | 2 +- celescope/snp/variant_calling.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/celescope/snp/analysis_snp.py b/celescope/snp/analysis_snp.py index 80ecc3a0..6a109073 100755 --- a/celescope/snp/analysis_snp.py +++ b/celescope/snp/analysis_snp.py @@ -4,11 +4,11 @@ import subprocess import pandas as pd import pysam -from mutract.utils import read_CID import celescope.tools.utils as utils from celescope.tools.analysis_mixin import AnalysisMixin from celescope.tools.step import Step, s_common +from celescope.snp.variant_calling import read_CID class Analysis_variant(Step, AnalysisMixin): diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index d36c83c1..d4fe6df8 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -42,6 +42,12 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): return df +def read_CID(CID_file): + df_index = 
pd.read_csv(CID_file, sep='\t', index_col=0, dtype=object) + df_valid = df_index[df_index['valid'] == 'True'] + return df_index, df_valid + + class Variant_calling(Step): """ Features @@ -231,9 +237,7 @@ class Variant_calling(Step): all_res.append(res) def read_CID(self): - df_index = pd.read_csv(self.CID_file, sep='\t', index_col=0, dtype=object) - df_valid = df_index[df_index['valid'] == 'True'] - return df_index, df_valid + return read_CID(self.CID_file) @utils.add_log -- Gitee From 543f915daad90125a42d896417f4a579a16d043b Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 14:00:21 +0800 Subject: [PATCH 69/96] remove unused --- celescope/snp/variant_calling.py | 1 - 1 file changed, 1 deletion(-) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index d4fe6df8..4d3e7324 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -125,7 +125,6 @@ class Variant_calling(Step): for read in samfile: try: barcode = read.get_tag('CB') - UMI = read.get_tag('UB') except KeyError: continue if barcode in self.barcodes: -- Gitee From f0aaf16971f4c9a03cbed484333e556933820c8e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 16:41:20 +0800 Subject: [PATCH 70/96] Auto chemistry detection failed info --- celescope/tools/barcode.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index 1c81e3f3..c03b8874 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -238,7 +238,11 @@ class Chemistry(): Chemistry.get_chemistry.logger.info(linker_4_dict) if valid_linker_type == 0: print(linker_wrong_dict) - raise Exception('auto chemistry detection failed!') + raise Exception( + 'Auto chemistry detection failed! ' + 'If the sample is from Singleron, ask the technical staff you are connecting with for the chemistry used. ' + 'You need to use `--chemistry scopeV1` for scopeV1, and `--chemistry auto` should be fine for scopeV2.* ' + ) elif valid_linker_type == 1: chemistry = 'scopeV2.1.1' elif valid_linker_type < 4: -- Gitee From 67a38c1653b8219811d0706d789ee9404c00b55e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 16:59:38 +0800 Subject: [PATCH 71/96] add log file --- celescope/tools/multi.py | 3 ++- celescope/tools/utils.py | 15 ++++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index afdce712..9fed21b8 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -130,7 +130,8 @@ class Multi(): # mk log dir self.logdir = self.args.outdir + '/log' - os.system('mkdir -p %s' % (self.logdir)) + if self.args.mod == 'sjm': + os.system('mkdir -p %s' % (self.logdir)) # script init self.sjm_cmd = 'log_dir %s\n' % (self.logdir) diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index c3085586..a74388ea 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -30,15 +30,20 @@ def add_log(func): ''' logging start and done. 
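+    Messages go to stdout and are appended to ./celescope_log.txt;
+    each wrapped function logs under the name '<module>.<function>'.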
''' - logging.basicConfig( - level=logging.INFO, - stream=sys.stdout, - format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' - ) + logFormatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s') + module = func.__module__ name = func.__name__ logger_name = f'{module}.{name}' logger = logging.getLogger(logger_name) + logger.setLevel(logging.INFO) + + fileHandler = logging.FileHandler("./celescope_log.txt") + fileHandler.setFormatter(logFormatter) + logger.addHandler(fileHandler) + consoleHandler = logging.StreamHandler(sys.stdout) + consoleHandler.setFormatter(logFormatter) + logger.addHandler(consoleHandler) @wraps(func) def wrapper(*args, **kwargs): -- Gitee From af2c041ce802f1f3ca039315feb1200a184aea4e Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Wed, 23 Jun 2021 17:31:06 +0800 Subject: [PATCH 72/96] docs --- .gitignore | 1 + celescope/rna/mkref.py | 15 ++++++++++++--- celescope/tag/split_tag.py | 13 ++++++++++--- docs/manual.md | 1 + docs/rna/mkref.md | 7 ++++--- docs/tag/split_tag.md | 26 ++++++++++++++++++++++++++ 6 files changed, 54 insertions(+), 9 deletions(-) create mode 100644 docs/tag/split_tag.md diff --git a/.gitignore b/.gitignore index 30d2a341..3907b7e2 100755 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ # test output +celescope_log.txt test_output/ # vscode diff --git a/celescope/rna/mkref.py b/celescope/rna/mkref.py index 26a4311c..606547b4 100755 --- a/celescope/rna/mkref.py +++ b/celescope/rna/mkref.py @@ -96,11 +96,20 @@ def mkref(args): def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: - parser.add_argument("--fasta", help="Required. Genome fasta file.", required=True) - parser.add_argument("--gtf", help="Required. Genome gtf file.", required=True) + parser.add_argument( + "--fasta", + help="Required. Genome fasta file. Must be relative file path to genomeDir.", + required=True + ) + parser.add_argument( + "--gtf", + help="Required. Genome gtf file. Must be relative file path to genomeDir.", + required=True + ) parser.add_argument( "--mt_gene_list", - help="""Mitochondria gene list file. It is a plain text file with one gene per line. + help="""Mitochondria gene list file. Must be relative file path to genomeDir. +It is a plain text file with one gene per line. If not provided, will use `MT-` and `mt-` to determine mitochondria genes.""", default="None" ) diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index 0cfb5ccb..aa3bb587 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -13,6 +13,13 @@ from celescope.tools.step import Step, s_common from celescope.__init__ import HELP_DICT class Split_tag(Step): + """ + Features + - Split scRNA-Seq fastq according to tag assignment. + + Output + - `fastq/{tag}_{1,2}.fq` Fastq files of each tag. 
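+    R2 reads are assigned to a tag by cell barcode; the matching R1 reads are then recovered by read index.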
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
@@ -85,12 +92,12 @@ def split_tag(args):
 def get_opts_split_tag(parser, sub_program):
     parser.add_argument(
         "--split_fastq",
-        help="Split scRNA-Seq fastq file(01.barcode/{sample}_2.fq).",
+        help="If used, will split scRNA-Seq fastq file according to tag assignment.",
        action='store_true',
    )
     if sub_program:
-        parser.add_argument("--umi_tag_file", help="UMI tag file", required=True)
+        parser.add_argument("--umi_tag_file", help="UMI tag file.", required=True)
         parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True)
-        parser.add_argument("--R1_read", help='R1 read path')
+        parser.add_argument("--R1_read", help='R1 read path.')
         s_common(parser)
diff --git a/docs/manual.md b/docs/manual.md
index 2412e3fe..e9caf430 100755
--- a/docs/manual.md
+++ b/docs/manual.md
@@ -35,3 +35,4 @@
 - [mapping_tag](tag/mapping_tag.md)
 - [count_tag](tag/count_tag.md)
 - [analysis_tag](tag/analysis_tag.md)
+- [split_tag](tag/split_tag.md)
diff --git a/docs/rna/mkref.md b/docs/rna/mkref.md
index 0ab50b68..c1b3d592 100644
--- a/docs/rna/mkref.md
+++ b/docs/rna/mkref.md
@@ -28,10 +28,11 @@ refflat = Homo_sapiens_ensembl_99.refFlat
 
 `--dry_run` Only write config file and exit.
 
-`--fasta` Required. Genome fasta file.
+`--fasta` Required. Genome fasta file. Must be relative file path to genomeDir.
 
-`--gtf` Required. Genome gtf file.
+`--gtf` Required. Genome gtf file. Must be relative file path to genomeDir.
 
-`--mt_gene_list` Mitochondria gene list file. It is a plain text file with one gene per line.
+`--mt_gene_list` Mitochondria gene list file. Must be relative file path to genomeDir.
+It is a plain text file with one gene per line.
 If not provided, will use `MT-` and `mt-` to determine mitochondria genes.
 
diff --git a/docs/tag/split_tag.md b/docs/tag/split_tag.md
new file mode 100644
index 00000000..5a43f7f8
--- /dev/null
+++ b/docs/tag/split_tag.md
@@ -0,0 +1,26 @@
+## Features
+- Split scRNA-Seq fastq according to tag assignment.
+
+## Output
+- `fastq/{tag}_{1,2}.fq` Fastq files of each tag.
+
+
+## Arguments
+`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment.
+
+`--umi_tag_file` UMI tag file.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--R1_read` R1 read path.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
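+
+## Example
+A minimal sketch of the splitting these arguments drive (illustrative only; the
+two-column `umi_tag_file` layout, the barcode-in-read-name convention, and the
+`split_by_tag` helper are assumptions here, not the shipped implementation):
+
+```python
+import pysam
+
+
+def split_by_tag(umi_tag_file, r2_fastq, outdir):
+    # barcode -> tag assignment table (tab-separated, header skipped)
+    barcode2tag = {}
+    with open(umi_tag_file) as f:
+        next(f)
+        for line in f:
+            barcode, tag = line.rstrip('\n').split('\t')[:2]
+            barcode2tag[barcode] = tag
+
+    # route each R2 read to the fastq of its assigned tag
+    handles = {}
+    with pysam.FastxFile(r2_fastq) as fq:
+        for read in fq:
+            # assumes the cell barcode is the first '_'-separated field of the read name
+            tag = barcode2tag.get(read.name.split('_')[0])
+            if tag is None:
+                continue
+            if tag not in handles:
+                handles[tag] = open(f'{outdir}/{tag}_2.fq', 'w')
+            handles[tag].write(str(read) + '\n')
+    for handle in handles.values():
+        handle.close()
+```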
+ -- Gitee From fd73c585029a0969e18b70e9850154e456c2fa83 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 09:14:15 +0800 Subject: [PATCH 73/96] pep8 --- celescope/capture_rna/__init__.py | 2 +- celescope/capture_rna/count_capture_rna.py | 10 +-- celescope/capture_rna/multi_capture_rna.py | 9 +-- celescope/capture_virus/__init__.py | 6 +- .../capture_virus/analysis_capture_virus.py | 7 +- .../capture_virus/count_capture_virus.py | 2 - celescope/capture_virus/mkref.py | 10 +-- .../capture_virus/multi_capture_virus.py | 5 +- celescope/capture_virus/otsu.py | 7 +- celescope/capture_virus/test.py | 6 +- celescope/celescope.py | 3 +- celescope/citeseq/Count_cite.py | 16 ++-- celescope/citeseq/__init__.py | 2 +- celescope/citeseq/analysis_cite.py | 3 +- celescope/citeseq/count_cite.py | 2 +- celescope/citeseq/multi_citeseq.py | 1 - celescope/fusion/count_fusion.py | 3 +- celescope/fusion/mkref.py | 16 ++-- celescope/fusion/multi_fusion.py | 2 - celescope/fusion/star_fusion.py | 1 - celescope/hla/mapping_hla.py | 4 +- celescope/hla/multi_hla.py | 3 +- celescope/hla/test_hla.py | 2 +- celescope/mut/count_mut.py | 3 +- celescope/mut/mapping_mut.py | 2 +- celescope/mut/multi_mut.py | 2 +- celescope/rna/__init__.py | 10 +-- celescope/rna/analysis.py | 16 ++-- celescope/rna/mkref.py | 22 +++--- celescope/rna/multi_rna.py | 2 + celescope/rna/star.py | 13 ++-- celescope/rna_virus/__init__.py | 2 +- celescope/rna_virus/analysis_rna_virus.py | 1 - celescope/rna_virus/count_virus.py | 1 - celescope/rna_virus/multi_rna_virus.py | 3 +- celescope/rna_virus/star_virus.py | 9 +-- celescope/snp/__init__.py | 2 +- celescope/snp/analysis_snp.py | 25 +++---- celescope/snp/mkref.py | 1 - celescope/snp/multi_snp.py | 5 +- celescope/snp/tests/test_variant_calling.py | 23 +++--- celescope/snp/utils/plot_vid.py | 15 ++-- celescope/snp/variant_calling.py | 74 +++++++++---------- celescope/tag/analysis_tag.py | 4 +- celescope/tag/count_tag.py | 38 +++++----- celescope/tag/mapping_tag.py | 16 ++-- celescope/tag/multi_tag.py | 3 +- celescope/tag/split_tag.py | 11 +-- celescope/tag/tests.py | 2 +- celescope/tcr_fl/__init__.py | 2 +- celescope/tcr_fl/assemble.py | 1 + celescope/tcr_fl/barcode_index.py | 3 - celescope/tcr_fl/multi_tcr_fl.py | 1 + celescope/tcr_fl/split_fq.py | 9 ++- celescope/tests/conftest.py | 2 +- celescope/tests/test_function.py | 6 +- celescope/tests/test_multi.py | 2 +- celescope/tools/analysis_mixin.py | 13 ++-- celescope/tools/barcode.py | 40 +++++----- celescope/tools/cellranger3/cell_calling_3.py | 61 +++++++-------- .../tools/cellranger3/get_plot_elements.py | 14 ++-- celescope/tools/cellranger3/sgt.py | 9 ++- celescope/tools/cellranger3/stats.py | 27 +++---- celescope/tools/consensus.py | 16 ++-- celescope/tools/count.py | 11 +-- celescope/tools/cutadapt.py | 21 +++--- celescope/tools/debug.py | 3 - celescope/tools/featureCounts.py | 6 +- celescope/tools/mkref.py | 2 +- celescope/tools/multi.py | 17 ++--- celescope/tools/report.py | 4 +- celescope/tools/sample.py | 6 +- celescope/tools/star_mixin.py | 25 ++++--- celescope/tools/step.py | 20 ++--- celescope/tools/target_metrics.py | 5 +- celescope/tools/tests.py | 9 ++- celescope/tools/utils.py | 64 ++++++++-------- celescope/vdj/__init__.py | 2 +- celescope/vdj/mapping_vdj.py | 15 ++-- celescope/vdj/multi_vdj.py | 4 +- 80 files changed, 407 insertions(+), 440 deletions(-) diff --git a/celescope/capture_rna/__init__.py b/celescope/capture_rna/__init__.py index 9197e1fc..0f9e4577 100755 --- a/celescope/capture_rna/__init__.py +++ 
b/celescope/capture_rna/__init__.py @@ -12,4 +12,4 @@ __STEPS__ = [ IMPORT_DICT = { 'star': 'celescope.rna', 'analysis': 'celescope.rna', -} \ No newline at end of file +} diff --git a/celescope/capture_rna/count_capture_rna.py b/celescope/capture_rna/count_capture_rna.py index eb54a58f..05062587 100755 --- a/celescope/capture_rna/count_capture_rna.py +++ b/celescope/capture_rna/count_capture_rna.py @@ -10,7 +10,7 @@ from celescope.tools.count import Count, get_opts_count class Count_capture_rna(Count): - + def bam2table(self): """ read probe file @@ -56,7 +56,7 @@ class Count_capture_rna(Count): read_count = 0 for barcode in probe_gene_count_dict[probe][geneName]: for umi in probe_gene_count_dict[probe][geneName][barcode]: - umi_count += len( probe_gene_count_dict[probe][geneName][barcode]) + umi_count += len(probe_gene_count_dict[probe][geneName][barcode]) read_count += probe_gene_count_dict[probe][geneName][barcode][umi] row_list.append({ 'probe': probe, @@ -67,13 +67,12 @@ class Count_capture_rna(Count): }) df_probe = pd.DataFrame(row_list, - columns=['probe', 'gene', 'barcode_count', 'read_count', 'UMI_count']) + columns=['probe', 'gene', 'barcode_count', 'read_count', 'UMI_count']) df_probe = df_probe.groupby(['probe']).apply( lambda x: x.sort_values('UMI_count', ascending=False) ) return df_probe - def run(self): df_probe = self.bam2table() df_probe.to_csv(f'{self.outdir}/{self.sample}_probe_gene_count.tsv', sep='\t', index=False) @@ -111,7 +110,6 @@ class Count_capture_rna(Count): self.clean_up() - @utils.add_log def count_capture_rna(args): # TODO! @@ -122,4 +120,4 @@ def count_capture_rna(args): def get_opts_count_capture_rna(parser, sub_program): - get_opts_count(parser, sub_program) \ No newline at end of file + get_opts_count(parser, sub_program) diff --git a/celescope/capture_rna/multi_capture_rna.py b/celescope/capture_rna/multi_capture_rna.py index 4de4f671..948e3b02 100755 --- a/celescope/capture_rna/multi_capture_rna.py +++ b/celescope/capture_rna/multi_capture_rna.py @@ -3,7 +3,7 @@ from celescope.tools.multi import Multi class Multi_capture_rna(Multi): - + def count_capture_rna(self, sample): step = 'count_capture_rna' cmd_line = self.get_cmd_line(step, sample) @@ -14,7 +14,7 @@ class Multi_capture_rna(Multi): f'--match_dir {self.col4_dict[sample]} ' ) self.process_cmd(cmd, step, sample, m=10, x=1) - + def analysis(self, sample): step = 'analysis' cmd_line = self.get_cmd_line(step, sample) @@ -30,9 +30,6 @@ def main(): multi = Multi_capture_rna(__ASSAY__) multi.run() + if __name__ == '__main__': main() - - - - diff --git a/celescope/capture_virus/__init__.py b/celescope/capture_virus/__init__.py index 8b7b9b02..8efe430c 100755 --- a/celescope/capture_virus/__init__.py +++ b/celescope/capture_virus/__init__.py @@ -1,5 +1,5 @@ -__STEPS__ = [ - 'mkref', +__STEPS__ = [ + 'mkref', 'sample', 'barcode', 'cutadapt', @@ -12,4 +12,4 @@ __ASSAY__ = 'capture_virus' IMPORT_DICT = { 'star_virus': 'celescope.rna_virus', -} \ No newline at end of file +} diff --git a/celescope/capture_virus/analysis_capture_virus.py b/celescope/capture_virus/analysis_capture_virus.py index f840e01e..0a9bec2a 100755 --- a/celescope/capture_virus/analysis_capture_virus.py +++ b/celescope/capture_virus/analysis_capture_virus.py @@ -19,9 +19,10 @@ def analysis_capture_virus(args): runner = Analysis_capture_virus(args, step_name) runner.run() + def get_opts_analysis_capture_virus(parser, sub_program): - parser.add_argument("--umi_threshold", help='method to find virus UMI threshold', - choices=['otsu', 
'none'], default='otsu') + parser.add_argument("--umi_threshold", help='method to find virus UMI threshold', + choices=['otsu', 'none'], default='otsu') if sub_program: s_common(parser) parser.add_argument('--match_dir', help='match_dir', required=True) @@ -58,7 +59,6 @@ class Analysis_capture_virus(Step, AnalysisMixin): self.add_data_item(table_dict=table_dict) self.clean_up() - def get_virus_tsne(self, virus_df): virus_tsne_df = pd.merge(self.tsne_df, virus_df, on="barcode", how="left") virus_tsne_df.to_csv(self.virus_tsne_file, sep='\t') @@ -84,4 +84,3 @@ class Analysis_capture_virus(Step, AnalysisMixin): ) df_thresh = df_virus[df_virus["UMI"] >= threshold] df_thresh.to_csv(self.otsu_virus_file, sep='\t') - diff --git a/celescope/capture_virus/count_capture_virus.py b/celescope/capture_virus/count_capture_virus.py index 74bd1b48..531fe481 100755 --- a/celescope/capture_virus/count_capture_virus.py +++ b/celescope/capture_virus/count_capture_virus.py @@ -46,7 +46,6 @@ def sum_virus(validated_barcodes, virus_bam, @add_log def count_capture_virus(args): - # 检查和创建输出目录 if not os.path.exists(args.outdir): os.system('mkdir -p %s' % (args.outdir)) @@ -71,4 +70,3 @@ def get_opts_count_capture_virus(parser, sub_program): s_common(parser) parser.add_argument('--match_dir', help='matched rna_virus directory', required=True) parser.add_argument('--virus_bam', required=True) - diff --git a/celescope/capture_virus/mkref.py b/celescope/capture_virus/mkref.py index fdd07bd0..eab4ba53 100755 --- a/celescope/capture_virus/mkref.py +++ b/celescope/capture_virus/mkref.py @@ -8,8 +8,8 @@ from celescope.tools.mkref import parse_genomeDir def parse_genomeDir_virus(genomeDir): - return parse_genomeDir(genomeDir, entrys = ('fasta',)) - + return parse_genomeDir(genomeDir, entrys=('fasta',)) + class Mkref_virus(Mkref): def __init__(self, genome_type, args): @@ -41,7 +41,7 @@ class Mkref_virus(Mkref): genome['genomeSAindexNbases'] = self.genomeSAindexNbases with open(self.config_file, 'w') as config_handle: config.write(config_handle) - + def run(self): if not self.dry_run: self.build_star_index() @@ -57,5 +57,5 @@ def mkref(args): def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: - parser.add_argument("--fasta", help="virus fasta file",required=True) - parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) \ No newline at end of file + parser.add_argument("--fasta", help="virus fasta file", required=True) + parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) diff --git a/celescope/capture_virus/multi_capture_virus.py b/celescope/capture_virus/multi_capture_virus.py index 8c5019a8..faf71f06 100755 --- a/celescope/capture_virus/multi_capture_virus.py +++ b/celescope/capture_virus/multi_capture_virus.py @@ -29,7 +29,7 @@ class Multi_capture_virus(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) - def analysis_capture_virus(self, sample): + def analysis_capture_virus(self, sample): step = 'analysis_capture_virus' cmd_line = self.get_cmd_line(step, sample) virus_file = f'{self.outdir_dic[sample]["count_capture_virus"]}/{sample}_virus_UMI_count.tsv' @@ -48,6 +48,3 @@ def main(): if __name__ == '__main__': main() - - - diff --git a/celescope/capture_virus/otsu.py b/celescope/capture_virus/otsu.py index 8f65a4dc..5a555a55 100755 --- a/celescope/capture_virus/otsu.py +++ b/celescope/capture_virus/otsu.py @@ -1,8 +1,8 @@ +import matplotlib.pyplot as plt import matplotlib import numpy as np matplotlib.use('Agg') 
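+# NB: matplotlib.use('Agg') only takes effect reliably when it runs before the
+# first pyplot import, so with pyplot now imported at the top of this module
+# the backend switch above may be a no-op on some systems.
+# For orientation: threshold_otsu() below picks the histogram cut that
+# maximizes between-class variance (Otsu's method). Rough intended usage,
+# hedged (argument and return conventions here are assumptions, not the
+# exact API):
+#     counts, bins = array2hist(np.log10(umi_array + 1))
+#     thresh = threshold_otsu(counts)
+#     makePlot(counts, thresh, 'otsu.png')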
-import matplotlib.pyplot as plt def threshold_otsu(hist): @@ -44,8 +44,8 @@ def threshold_otsu(hist): def array2hist(array, binWidth=0.2): - counts,bins = np.histogram(array, bins=np.arange(0,max(array)+binWidth,binWidth)) - return counts,bins + counts, bins = np.histogram(array, bins=np.arange(0, max(array)+binWidth, binWidth)) + return counts, bins def makePlot(hist, thresh, fname): @@ -54,4 +54,3 @@ def makePlot(hist, thresh, fname): plt.axvline(thresh, color='r') plt.savefig(fname) plt.close() - diff --git a/celescope/capture_virus/test.py b/celescope/capture_virus/test.py index ac7ba8e4..84f974f5 100755 --- a/celescope/capture_virus/test.py +++ b/celescope/capture_virus/test.py @@ -13,9 +13,9 @@ class test_capture(unittest.TestCase): def test_otsu(self): count_files = [ -'/SGRNJ02/RandD4/virus_panel/20210124_4/S1225_EBV_Skin_Auto_SDF_NEB/07.count_virus/S1225_EBV_Skin_Auto_SDF_NEB_virus_UMI_count.tsv', -'/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_Beads_Manual_KZ/04.count_capture_virus/virus_test3_R_A_Beads_Manual_KZ_virus_UMI_count.tsv', -'/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_3Mins_Manual_KZ/04.count_capture_virus/virus_test3_R_A_3Mins_Manual_KZ_virus_UMI_count.tsv' + '/SGRNJ02/RandD4/virus_panel/20210124_4/S1225_EBV_Skin_Auto_SDF_NEB/07.count_virus/S1225_EBV_Skin_Auto_SDF_NEB_virus_UMI_count.tsv', + '/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_Beads_Manual_KZ/04.count_capture_virus/virus_test3_R_A_Beads_Manual_KZ_virus_UMI_count.tsv', + '/SGRNJ02/RandD4/virus_panel/20210124/virus_test3_R_A_3Mins_Manual_KZ/04.count_capture_virus/virus_test3_R_A_3Mins_Manual_KZ_virus_UMI_count.tsv' ] count_file = '/SGRNJ02/RandD4/virus_panel/20210124_4/S1225_EBV_Skin_Auto_SDF_NEB/07.count_virus/S1225_EBV_Skin_Auto_SDF_NEB_virus_UMI_count.tsv' for count_file in count_files: diff --git a/celescope/celescope.py b/celescope/celescope.py index 14b3b568..3a8d6308 100755 --- a/celescope/celescope.py +++ b/celescope/celescope.py @@ -7,10 +7,11 @@ from celescope.__init__ import __VERSION__, ASSAY_DICT class ArgFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawTextHelpFormatter): pass + def main(): """celescope cli """ - parser = argparse.ArgumentParser(description='CeleScope',formatter_class=ArgFormatter) + parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) parser.add_argument('-v', '--version', action='version', version=__VERSION__) subparsers = parser.add_subparsers() diff --git a/celescope/citeseq/Count_cite.py b/celescope/citeseq/Count_cite.py index 7c410407..d7710829 100755 --- a/celescope/citeseq/Count_cite.py +++ b/celescope/citeseq/Count_cite.py @@ -17,7 +17,7 @@ class Count_cite(): assay, read_count_file, match_dir, - ): + ): self.sample = sample self.outdir = outdir self.assay = assay @@ -26,7 +26,7 @@ class Count_cite(): self.match_barcode, self.cell_total = read_barcode_file(match_dir) self.df_read_count = pd.read_csv(read_count_file, sep="\t", index_col=0) self.tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0] - + if not os.path.exists(outdir): os.system('mkdir -p %s' % outdir) @@ -88,9 +88,9 @@ class Count_cite(): self.stats.to_csv(self.stat_file, sep=':', header=False) t = reporter( - name='count_cite', - assay=self.assay, - sample=self.sample, - stat_file=self.stat_file, - outdir=self.outdir + '/..') - t.get_report() \ No newline at end of file + name='count_cite', + assay=self.assay, + sample=self.sample, + stat_file=self.stat_file, + outdir=self.outdir + '/..') + t.get_report() diff --git 
a/celescope/citeseq/__init__.py b/celescope/citeseq/__init__.py
index d73510e3..86a5145c 100755
--- a/celescope/citeseq/__init__.py
+++ b/celescope/citeseq/__init__.py
@@ -3,4 +3,4 @@ __ASSAY__ = 'citeseq'
 
 IMPORT_DICT = {
     'mapping_tag': 'celescope.tag'
-}
\ No newline at end of file
+}
diff --git a/celescope/citeseq/analysis_cite.py b/celescope/citeseq/analysis_cite.py
index 2c5f7403..8f061723 100755
--- a/celescope/citeseq/analysis_cite.py
+++ b/celescope/citeseq/analysis_cite.py
@@ -18,7 +18,7 @@ def analysis_cite(args):
 
     if not os.path.exists(args.outdir):
         os.system('mkdir -p %s' % args.outdir)
-    
+
     rds = parse_match_dir(args.match_dir)['rds']
     app = CITESEQ_DIR + "/analysis_cite.R"
     cmd = (
@@ -29,4 +29,3 @@ def analysis_cite(args):
         f'--sample {args.sample} '
     )
     os.system(cmd)
-    
\ No newline at end of file
diff --git a/celescope/citeseq/count_cite.py b/celescope/citeseq/count_cite.py
index b0a6486a..d2563202 100755
--- a/celescope/citeseq/count_cite.py
+++ b/celescope/citeseq/count_cite.py
@@ -20,4 +20,4 @@ def count_cite(args):
         args.match_dir,
     )
     count_cite_object.run()
-    count_cite_object.report()
\ No newline at end of file
+    count_cite_object.report()
diff --git a/celescope/citeseq/multi_citeseq.py b/celescope/citeseq/multi_citeseq.py
index b1a64081..0ec65bc9 100755
--- a/celescope/citeseq/multi_citeseq.py
+++ b/celescope/citeseq/multi_citeseq.py
@@ -4,4 +4,3 @@ def main():
 
     # TODO
     pass
-
diff --git a/celescope/fusion/count_fusion.py b/celescope/fusion/count_fusion.py
index ab81745d..b4facae2 100755
--- a/celescope/fusion/count_fusion.py
+++ b/celescope/fusion/count_fusion.py
@@ -115,9 +115,8 @@ class CountFusion(Step):
         os.system(cmd)
         count_fusion.logger.info("plot done.")
 
-
     def run(self):
-        self.count_fusion()
+        self.count_fusion()
         self.clean_up()
 
 
diff --git a/celescope/fusion/mkref.py b/celescope/fusion/mkref.py
index 0e7c7841..17134c48 100755
--- a/celescope/fusion/mkref.py
+++ b/celescope/fusion/mkref.py
@@ -8,8 +8,8 @@ from celescope.tools.mkref import parse_genomeDir
 
 
 def parse_genomeDir_fusion(genomeDir):
-    return parse_genomeDir(genomeDir, entrys = ('fasta','fusion_pos'))
-    
+    return parse_genomeDir(genomeDir, entrys=('fasta', 'fusion_pos'))
+
 
 class Mkref_fusion(Mkref):
     def __init__(self, genome_type, args):
@@ -43,7 +43,7 @@ class Mkref_fusion(Mkref):
         genome['genomeSAindexNbases'] = self.genomeSAindexNbases
         with open(self.config_file, 'w') as config_handle:
             config.write(config_handle)
-    
+
     def run(self):
         if not self.dry_run:
             self.build_star_index()
@@ -59,9 +59,9 @@ def mkref(args):
 def get_opts_mkref(parser, sub_program):
     opts(parser, sub_program)
     if sub_program:
-        parser.add_argument("--fasta", help="fusion fasta file",required=True)
+        parser.add_argument("--fasta", help="fusion fasta file", required=True)
         parser.add_argument(
-            "--fusion_pos",
+            "--fusion_pos",
             help="""
fusion position file. A two column tab-delimited text file with header.
"pos" is the end position of the first gene(1-based).
@@ -71,6 +71,6 @@ PML_3\t183 PML_4\t254 PML_5\t326 PML_6\t204 -""", - required=True,) - parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) \ No newline at end of file +""", + required=True,) + parser.add_argument("--genomeSAindexNbases", help="STAR genomeSAindexNbases", default=4) diff --git a/celescope/fusion/multi_fusion.py b/celescope/fusion/multi_fusion.py index b82cd4be..b69de656 100755 --- a/celescope/fusion/multi_fusion.py +++ b/celescope/fusion/multi_fusion.py @@ -33,5 +33,3 @@ def main(): if __name__ == '__main__': main() - - diff --git a/celescope/fusion/star_fusion.py b/celescope/fusion/star_fusion.py index 54c02d8b..c70494f9 100755 --- a/celescope/fusion/star_fusion.py +++ b/celescope/fusion/star_fusion.py @@ -26,4 +26,3 @@ def get_opts_star_fusion(parser, sub_program): # will cause `conflicting option string: --genomeDir` # parser.add_argument('--genomeDir', help=argparse.SUPPRESS) parser.add_argument('--fusion_genomeDir', help='fusion gene STAR index genome directory', required=True) - diff --git a/celescope/hla/mapping_hla.py b/celescope/hla/mapping_hla.py index db08e6ba..bce9c4e1 100755 --- a/celescope/hla/mapping_hla.py +++ b/celescope/hla/mapping_hla.py @@ -148,10 +148,10 @@ def hla_typing(index_file, outdir, thread): @add_log def summary(index_file, outdir, sample): - + n = 0 df_valid = read_index(index_file) - + for index in df_valid.index: try: sub_df = pd.read_csv( diff --git a/celescope/hla/multi_hla.py b/celescope/hla/multi_hla.py index f5f4cc8b..21e44651 100755 --- a/celescope/hla/multi_hla.py +++ b/celescope/hla/multi_hla.py @@ -1,7 +1,8 @@ def main(): - #TODO + # TODO pass + if __name__ == '__main__': main() diff --git a/celescope/hla/test_hla.py b/celescope/hla/test_hla.py index 284c856b..a50e3ccc 100755 --- a/celescope/hla/test_hla.py +++ b/celescope/hla/test_hla.py @@ -30,7 +30,7 @@ class testHLA(unittest.TestCase): def test_read_index(self): read_index(self.index_file) - #@unittest.skip('pass') + # @unittest.skip('pass') def test_summary(self): summary(self.index_file, self.mapping_outdir, self.sample) diff --git a/celescope/mut/count_mut.py b/celescope/mut/count_mut.py index 094ce94c..aa72abd3 100755 --- a/celescope/mut/count_mut.py +++ b/celescope/mut/count_mut.py @@ -50,7 +50,6 @@ def count_mut(args): mut_dic = read_mut(mut_file) out_prefix = outdir + "/" + sample - # tsne match_dict = parse_match_dir(match_dir) df_tsne = pd.read_csv(match_dict['tsne_coord'], sep="\t", index_col=0) @@ -142,7 +141,7 @@ def count_mut(args): out_insertion_barcode_count_file, sep="\t") df_tsne_mut = pd.merge(df_tsne, df_insertion_barcode_count, - right_index=True, left_index=True, how="left") + right_index=True, left_index=True, how="left") df_tsne_mut.fillna(0, inplace=True) df_tsne_mut.to_csv(out_tsne_file, sep="\t") diff --git a/celescope/mut/mapping_mut.py b/celescope/mut/mapping_mut.py index 14d2cc1d..f260af22 100755 --- a/celescope/mut/mapping_mut.py +++ b/celescope/mut/mapping_mut.py @@ -25,4 +25,4 @@ def get_opts_mapping_mut(parser, sub_program): help='insertion or deletion STAR indexed genome directory', required=True) parser.add_argument("--thread", help='STAR thread', default=1) - parser.add_argument("--outFilterMatchNmin", help='STAR outFilterMatchNmin', default=35) \ No newline at end of file + parser.add_argument("--outFilterMatchNmin", help='STAR outFilterMatchNmin', default=35) diff --git a/celescope/mut/multi_mut.py b/celescope/mut/multi_mut.py index f77d0c01..7fc122f3 100755 --- a/celescope/mut/multi_mut.py +++ 
b/celescope/mut/multi_mut.py @@ -43,6 +43,6 @@ def main(): multi = Multi_mut(__ASSAY__) multi.run() + if __name__ == '__main__': main() - diff --git a/celescope/rna/__init__.py b/celescope/rna/__init__.py index c437cef4..d1b579b8 100755 --- a/celescope/rna/__init__.py +++ b/celescope/rna/__init__.py @@ -9,11 +9,11 @@ __STEPS__ = [ 'analysis'] __ASSAY__ = 'rna' -# m: memory +# m: memory # x: thread RESOURCE = { - 'sample': {'m':1, 'x':1}, - 'barcode': {'m':5, 'x':1}, - 'cutadapt': {'m':5, 'x':1}, - 'star': {'m':30, 'x':1}, + 'sample': {'m': 1, 'x': 1}, + 'barcode': {'m': 5, 'x': 1}, + 'cutadapt': {'m': 5, 'x': 1}, + 'star': {'m': 30, 'x': 1}, } diff --git a/celescope/rna/analysis.py b/celescope/rna/analysis.py index 560b3768..8045e329 100755 --- a/celescope/rna/analysis.py +++ b/celescope/rna/analysis.py @@ -35,11 +35,12 @@ class Analysis_rna(Step, AnalysisMixin): - `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", + - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", it means that the given marker is not enough to identify the cluster. - - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) + - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. + - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) """ + def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) @@ -77,7 +78,7 @@ def get_opts_analysis(parser, sub_program): parser.add_argument('--genomeDir', help='Required. Genome directory.', required=True) parser.add_argument('--save_rds', action='store_true', help='Write rds to disk.') parser.add_argument( - '--type_marker_tsv', + '--type_marker_tsv', help="""A tsv file with header. If this parameter is provided, cell type will be annotated. Example: ``` cell_type marker @@ -94,11 +95,8 @@ LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" ) if sub_program: parser.add_argument( - '--matrix_file', - help='Required. Matrix_10X directory from step count.', + '--matrix_file', + help='Required. 
Matrix_10X directory from step count.', required=True, ) parser = s_common(parser) - - - diff --git a/celescope/rna/mkref.py b/celescope/rna/mkref.py index 606547b4..6e8ff3ce 100755 --- a/celescope/rna/mkref.py +++ b/celescope/rna/mkref.py @@ -7,8 +7,8 @@ from celescope.tools.mkref import get_opts_mkref as opts def parse_genomeDir_rna(genomeDir): - return parse_genomeDir(genomeDir, entrys = ('fasta', 'gtf', 'mt_gene_list')) - + return parse_genomeDir(genomeDir, entrys=('fasta', 'gtf', 'mt_gene_list')) + class Mkref_rna(Mkref): """ @@ -32,13 +32,14 @@ class Mkref_rna(Mkref): refflat = Homo_sapiens_ensembl_99.refFlat ``` """ + def __init__(self, genome_type, args): Mkref.__init__(self, genome_type, args) self.fasta = args.fasta self.gtf = args.gtf self.mt_gene_list = args.mt_gene_list - # out file + # out file self.refflat = f'{self.genome_name}.refFlat' @utils.add_log @@ -79,7 +80,7 @@ class Mkref_rna(Mkref): ) Mkref_rna.build_refflat.logger.info(cmd) subprocess.check_call(cmd, shell=True) - + @utils.add_log def run(self): if not self.dry_run: @@ -87,6 +88,7 @@ class Mkref_rna(Mkref): self.build_star_index() self.write_config() + def mkref(args): genome_type = 'rna' runner = Mkref_rna(genome_type, args) @@ -97,19 +99,19 @@ def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: parser.add_argument( - "--fasta", - help="Required. Genome fasta file. Must be relative file path to genomeDir.", + "--fasta", + help="Required. Genome fasta file. Must be relative file path to genomeDir.", required=True ) parser.add_argument( - "--gtf", - help="Required. Genome gtf file. Must be relative file path to genomeDir.", + "--gtf", + help="Required. Genome gtf file. Must be relative file path to genomeDir.", required=True ) parser.add_argument( - "--mt_gene_list", + "--mt_gene_list", help="""Mitochondria gene list file. Must be relative file path to genomeDir. It is a plain text file with one gene per line. 
-If not provided, will use `MT-` and `mt-` to determine mitochondria genes.""", +If not provided, will use `MT-` and `mt-` to determine mitochondria genes.""", default="None" ) diff --git a/celescope/rna/multi_rna.py b/celescope/rna/multi_rna.py index 0e995e8b..cbea5d52 100755 --- a/celescope/rna/multi_rna.py +++ b/celescope/rna/multi_rna.py @@ -5,9 +5,11 @@ from celescope.tools.multi import Multi class Multi_rna(Multi): pass + def main(): multi = Multi_rna(__ASSAY__) multi.run() + if __name__ == '__main__': main() diff --git a/celescope/rna/star.py b/celescope/rna/star.py index 3e9e1f4d..9f1512e1 100755 --- a/celescope/rna/star.py +++ b/celescope/rna/star.py @@ -67,7 +67,7 @@ class Star_rna(Step, StarMixin): data = picard_log.readline().strip().split('\t') region_dict = dict(zip(header, data)) break - + total = float(region_dict['PF_ALIGNED_BASES']) exonic_regions = int(region_dict['UTR_BASES']) + \ int(region_dict['CODING_BASES']) @@ -75,8 +75,8 @@ class Star_rna(Step, StarMixin): intergenic_regions = int(region_dict['INTERGENIC_BASES']) self.add_metric( - name='Base Pairs Mapped to Exonic Regions', - value=exonic_regions, + name='Base Pairs Mapped to Exonic Regions', + value=exonic_regions, total=total, ) self.add_metric( @@ -86,7 +86,7 @@ class Star_rna(Step, StarMixin): ) self.add_metric( name='Base Pairs Mapped to Intergenic Regions', - value=intergenic_regions, + value=intergenic_regions, total=total, ) @@ -107,10 +107,9 @@ class Star_rna(Step, StarMixin): ) region_plot = {'region_labels': ['Exonic Regions', 'Intronic Regions', 'Intergenic Regions'], - 'region_values': [exonic_regions, intronic_regions, intergenic_regions]} + 'region_values': [exonic_regions, intronic_regions, intergenic_regions]} self.add_content_item("data", STAR_plot=region_plot) - @utils.add_log def ribo(self): # TODO remove bbduk.sh and use picard ribo bases @@ -159,4 +158,4 @@ def star(args): def get_opts_star(parser, sub_program): - get_opts_star_mixin(parser, sub_program) \ No newline at end of file + get_opts_star_mixin(parser, sub_program) diff --git a/celescope/rna_virus/__init__.py b/celescope/rna_virus/__init__.py index 0601c4ca..3bf7e213 100755 --- a/celescope/rna_virus/__init__.py +++ b/celescope/rna_virus/__init__.py @@ -12,4 +12,4 @@ __STEPS__ = [ __ASSAY__ = 'rna_virus' IMPORT_DICT = { 'star': 'celescope.rna' -} \ No newline at end of file +} diff --git a/celescope/rna_virus/analysis_rna_virus.py b/celescope/rna_virus/analysis_rna_virus.py index 2548e30f..5da51724 100755 --- a/celescope/rna_virus/analysis_rna_virus.py +++ b/celescope/rna_virus/analysis_rna_virus.py @@ -124,4 +124,3 @@ def get_opts_analysis_rna_virus(parser, sub_program): '--virus_file', help='virus UMI count file', required=True) - \ No newline at end of file diff --git a/celescope/rna_virus/count_virus.py b/celescope/rna_virus/count_virus.py index 46ab2880..1cd9f04e 100755 --- a/celescope/rna_virus/count_virus.py +++ b/celescope/rna_virus/count_virus.py @@ -76,4 +76,3 @@ def get_opts_count_virus(parser, sub_program): s_common(parser) parser.add_argument('--virus_bam', required=True) parser.add_argument('--barcode_file', required=True) - diff --git a/celescope/rna_virus/multi_rna_virus.py b/celescope/rna_virus/multi_rna_virus.py index b40181b0..b921353e 100755 --- a/celescope/rna_virus/multi_rna_virus.py +++ b/celescope/rna_virus/multi_rna_virus.py @@ -4,7 +4,6 @@ from celescope.tools.multi import Multi class Multi_rna_virus(Multi): - def star_virus(self, sample): step = 'star_virus' fq = 
f'{self.outdir_dic[sample]["cutadapt"]}/{sample}_clean_2.fq{self.fq_suffix}' @@ -28,7 +27,7 @@ class Multi_rna_virus(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) - def analysis_rna_virus(self, sample): + def analysis_rna_virus(self, sample): step = 'analysis_rna_virus' virus_file = f'{self.outdir_dic[sample]["count_virus"]}/{sample}_virus_UMI_count.tsv' matrix_file = f'{self.outdir_dic[sample]["count"]}/{sample}_matrix.tsv.gz' diff --git a/celescope/rna_virus/star_virus.py b/celescope/rna_virus/star_virus.py index 3f6fae9a..905f58b9 100755 --- a/celescope/rna_virus/star_virus.py +++ b/celescope/rna_virus/star_virus.py @@ -7,13 +7,13 @@ class StarVirus(Step, StarMixin): """ star virus class """ + def __init__(self, args, step_name): - # add genomeDir + # add genomeDir args.genomeDir = args.virus_genomeDir - - Step.__init__(self, args, step_name) - StarMixin.__init__(self, args, add_prefix='virus') + Step.__init__(self, args, step_name) + StarMixin.__init__(self, args, add_prefix='virus') def run(self): self.run_star() @@ -30,4 +30,3 @@ def star_virus(args): def get_opts_star_virus(parser, sub_program): get_opts_star_mixin(parser, sub_program) parser.add_argument('--virus_genomeDir', help='virus genome dir', required=True) - diff --git a/celescope/snp/__init__.py b/celescope/snp/__init__.py index c9f17e51..1f9607bf 100755 --- a/celescope/snp/__init__.py +++ b/celescope/snp/__init__.py @@ -1,6 +1,6 @@ __STEPS__ = [ 'mkref', - 'sample', 'barcode', 'cutadapt', 'consensus', 'star', 'featureCounts', + 'sample', 'barcode', 'cutadapt', 'consensus', 'star', 'featureCounts', 'target_metrics', 'variant_calling', 'analysis_snp' ] __ASSAY__ = 'snp' diff --git a/celescope/snp/analysis_snp.py b/celescope/snp/analysis_snp.py index 6a109073..80aee4a1 100755 --- a/celescope/snp/analysis_snp.py +++ b/celescope/snp/analysis_snp.py @@ -22,7 +22,7 @@ class Analysis_variant(Step, AnalysisMixin): self.annovar_config = args.annovar_config self.match_dir = args.match_dir self.vcf_GT = None - + def get_df_count_tsne(self): ''' output: f'{self.outdir}/{self.sample}_count_tsne.tsv' @@ -30,8 +30,8 @@ class Analysis_variant(Step, AnalysisMixin): df_vc = pd.read_csv(self.variant_count_file, sep='\t') df_vc = df_vc[df_vc["alt_count"] > 0] df_vc_cell = df_vc.groupby('CID').agg({ - 'alt_count':'count', - 'VID':list, + 'alt_count': 'count', + 'VID': list, }) df_CID, _df_valid = read_CID(self.CID_file) @@ -39,7 +39,7 @@ class Analysis_variant(Step, AnalysisMixin): tsne_df_CID = pd.merge(self.tsne_df, df_CID, on='barcode', how='left') df_vc_barcode = pd.merge(df_vc_cell, df_CID, on='CID') - df_vc_barcode_tsne = pd.merge(df_vc_barcode, tsne_df_CID, on=['barcode','CID'], how='right') + df_vc_barcode_tsne = pd.merge(df_vc_barcode, tsne_df_CID, on=['barcode', 'CID'], how='right') df_vc_barcode_tsne['value'] = df_vc_barcode_tsne['alt_count'] df_vc_barcode_tsne['value'] = df_vc_barcode_tsne['value'].fillna(0) df_vc_barcode_tsne['value'].astype('int32') @@ -61,7 +61,7 @@ class Analysis_variant(Step, AnalysisMixin): text = list(df_count_tsne.apply(return_text, axis=1)) value = list(df_count_tsne.value) title = 't-SNE plot Colored by Cell Variant Counts' - count_tsne = {"tSNE_1": tSNE_1, "tSNE_2": tSNE_2, "text": text, 'value':value, 'title':title} + count_tsne = {"tSNE_1": tSNE_1, "tSNE_2": tSNE_2, "text": text, 'value': value, 'title': title} return count_tsne def add_GT(self): @@ -73,28 +73,26 @@ class Analysis_variant(Step, AnalysisMixin): out_vcf = pysam.VariantFile(out_vcf_file, 'w', header=vcf.header) for rec in vcf: 
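+            # Overwrite every genotype as homozygous alt (1, 1), presumably so
+            # the downstream ANNOVAR annotation always sees an explicit GT call.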
for sample in rec.samples: - rec.samples[sample]["GT"] = (1,1) + rec.samples[sample]["GT"] = (1, 1) out_vcf.write(rec) vcf.close() out_vcf.close() self.vcf_GT = out_vcf_file - def get_df_table(self): - - df_vcf = utils.parse_vcf(self.vcf_GT, infos=['VID','CID']) + + df_vcf = utils.parse_vcf(self.vcf_GT, infos=['VID', 'CID']) df_annovar = self.annovar() df_vcf = pd.concat((df_vcf, df_annovar), axis=1) - df_vcf["nCell"] = df_vcf["CID"].apply(func=lambda row:1 if isinstance(row,str) else len(row)) + df_vcf["nCell"] = df_vcf["CID"].apply(func=lambda row: 1 if isinstance(row, str) else len(row)) out_df_vcf = f'{self.outdir}/{self.sample}_variant_table.tsv' df_vcf.to_csv(out_df_vcf, sep='\t', index=False) - cols = ['VID','Chrom','Pos','Alleles','Gene','nCell','mRNA','Protein','COSMIC'] + cols = ['VID', 'Chrom', 'Pos', 'Alleles', 'Gene', 'nCell', 'mRNA', 'Protein', 'COSMIC'] df_vcf = df_vcf[cols] return df_vcf - def run(self): self.add_GT() cluster_tsne = self.get_cluster_tsne(colname='cluster', tsne_df=self.tsne_df) @@ -157,6 +155,7 @@ def analysis_snp(args): step_snp = Analysis_variant(args, step) step_snp.run() + def get_opts_analysis_snp(parser, sub_program): parser.add_argument('--annovar_config', help='annovar soft config file', required=True) if sub_program: @@ -164,4 +163,4 @@ def get_opts_analysis_snp(parser, sub_program): parser.add_argument('--match_dir', help='match_dir', required=True) parser.add_argument('--vcf', help='vcf file', required=True) parser.add_argument('--CID_file', help='CID_file', required=True) - parser.add_argument('--variant_count_file', help='variant count file', required=True) \ No newline at end of file + parser.add_argument('--variant_count_file', help='variant count file', required=True) diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py index a8e0a006..b5632a9f 100644 --- a/celescope/snp/mkref.py +++ b/celescope/snp/mkref.py @@ -66,4 +66,3 @@ def get_opts_mkref(parser, sub_program): opts(parser, sub_program) if sub_program: parser.add_argument("--fasta", help="fasta file", required=True) - diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index bad75811..69dbc418 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -12,7 +12,7 @@ class Multi_snp(Multi): else: fq = f'{self.outdir_dic[sample]["consensus"]}/{sample}_consensus.fq' cmd_line += ' --consensus_fq ' - + cmd = ( f'{cmd_line} ' f'--fq {fq} ' @@ -30,7 +30,6 @@ class Multi_snp(Multi): ) self.process_cmd(cmd, step, sample, m=2, x=1) - def variant_calling(self, sample): step = 'variant_calling' cmd_line = self.get_cmd_line(step, sample) @@ -62,6 +61,6 @@ def main(): multi = Multi_snp(__ASSAY__) multi.run() + if __name__ == '__main__': main() - diff --git a/celescope/snp/tests/test_variant_calling.py b/celescope/snp/tests/test_variant_calling.py index 5d612b00..cdeb3d55 100644 --- a/celescope/snp/tests/test_variant_calling.py +++ b/celescope/snp/tests/test_variant_calling.py @@ -5,21 +5,22 @@ from celescope.snp.variant_calling import Variant_calling ROOT_DIR = os.path.dirname(__file__) + class Test_variant_calling(unittest.TestCase): def setUp(self): os.chdir(ROOT_DIR) Args = namedtuple("Args", "thread outdir sample assay debug " + "genomeDir vcf bam match_dir") self.args = Args( - thread=10, - outdir="./test_output/07.variant_calling", - sample="test1", - assay="snp", - debug=False, - genomeDir="/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92", - vcf=None, - bam="./test_data/06.target_metrics/subset_filter.bam", - match_dir="./test_data/match_dir", - 
) + thread=10, + outdir="./test_output/07.variant_calling", + sample="test1", + assay="snp", + debug=False, + genomeDir="/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92", + vcf=None, + bam="./test_data/06.target_metrics/subset_filter.bam", + match_dir="./test_data/match_dir", + ) def test_run(self): obj = Variant_calling(self.args, "variant_calling") @@ -35,4 +36,4 @@ class Test_variant_calling(unittest.TestCase): obj.write_VID_file() obj.get_UMI() obj.write_support_matrix() - obj.clean_up() \ No newline at end of file + obj.clean_up() diff --git a/celescope/snp/utils/plot_vid.py b/celescope/snp/utils/plot_vid.py index 86f489f5..5211d01a 100644 --- a/celescope/snp/utils/plot_vid.py +++ b/celescope/snp/utils/plot_vid.py @@ -14,21 +14,24 @@ SAMPLE_COL_INDEX = 2 MATCH_DIR_COL_INDEX = 3 VID_COL_INDEX = 4 + @utils.add_log def parse_mapfile(mapfile): sample_vid_dict = {} sample_match_dir_dict = {} df_mapfile = pd.read_csv(mapfile, sep='\t', header=None) - def read_row(row): + + def read_row(row): sample = row[SAMPLE_COL_INDEX] match_dir = row[MATCH_DIR_COL_INDEX] - vid_list = [int(vid) for vid in row[VID_COL_INDEX ].strip().split(',')] + vid_list = [int(vid) for vid in row[VID_COL_INDEX].strip().split(',')] sample_vid_dict[sample] = vid_list sample_match_dir_dict[sample] = match_dir df_mapfile.apply(read_row, axis=1) return sample_vid_dict, sample_match_dir_dict + class Plot_vid(): def __init__(self, sample, outdir, vid_list, snp_dir, match_dir): self.sample = sample @@ -36,9 +39,9 @@ class Plot_vid(): # set vid_tsne_file = glob.glob(f'{snp_dir}/08.analysis_snp/*count_tsne.tsv')[0] - self.df_vid_tsne = pd.read_csv(vid_tsne_file, sep='\t', converters={"VID":ast.literal_eval}) + self.df_vid_tsne = pd.read_csv(vid_tsne_file, sep='\t', converters={"VID": ast.literal_eval}) match_tsne_file = glob.glob(f'{match_dir}/*analysis/*tsne_coord.tsv')[0] - self.df_match_tsne = pd.read_csv( match_tsne_file, sep='\t', index_col=0) + self.df_match_tsne = pd.read_csv(match_tsne_file, sep='\t', index_col=0) # out if not os.path.exists(outdir): @@ -56,10 +59,10 @@ class Plot_vid(): break return row df = self.df_vid_tsne.apply(set_label, axis=1) - barcode_list = df.loc[df["VIDs"]=="mutation",]["barcode"] + barcode_list = df.loc[df["VIDs"] == "mutation", ]["barcode"] self.df_match_tsne["VIDs"] = "wild_type" self.df_match_tsne.loc[barcode_list, "VIDs"] = "mutation" - plot = ggplot(self.df_match_tsne, aes(x="tSNE_1",y="tSNE_2",color="VIDs")) + geom_point(size=0.2) + plot = ggplot(self.df_match_tsne, aes(x="tSNE_1", y="tSNE_2", color="VIDs")) + geom_point(size=0.2) plot.save(self.out_plot_file) diff --git a/celescope/snp/variant_calling.py b/celescope/snp/variant_calling.py index 4d3e7324..7429e0c4 100755 --- a/celescope/snp/variant_calling.py +++ b/celescope/snp/variant_calling.py @@ -15,7 +15,6 @@ from celescope.tools.step import Step, s_common from celescope.rna.mkref import parse_genomeDir_rna - def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): ''' parse vcf into df @@ -34,11 +33,11 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles',), infos=('VID',)): rec_dict['alt'] = '.' if len(rec_dict['alleles']) == 2: rec_dict['alt'] = rec_dict['alleles'][1] - + for info in infos: rec_dict[info] = rec.info[info] - df = df.append(pd.Series(rec_dict),ignore_index=True) + df = df.append(pd.Series(rec_dict), ignore_index=True) return df @@ -67,8 +66,8 @@ class Variant_calling(Step): 2 : all reads/UMIs at the position support the alt allele. 
3 : one or more reads/UMIs support both the alt and the ref allele. """ - - def __init__(self, args, step_name): + + def __init__(self, args, step_name): Step.__init__(self, args, step_name) # set @@ -80,7 +79,7 @@ class Variant_calling(Step): self.vcf_bool = False self.df_vcf = None - # out + # out self.splitN_bam = f'{self.out_prefix}_splitN.bam' self.CID_file = f'{self.out_prefix}_CID.tsv' self.VID_file = f'{self.out_prefix}_VID.tsv' @@ -88,7 +87,6 @@ class Variant_calling(Step): self.variant_count_file = f'{self.out_prefix}_variant_count.tsv' self.support_matrix_file = f'{self.out_prefix}_support.mtx' - @utils.add_log def SplitNCigarReads(self): cmd = ( @@ -101,7 +99,6 @@ class Variant_calling(Step): Variant_calling.SplitNCigarReads.logger.info(cmd) subprocess.check_call(cmd, shell=True) - @utils.add_log def split_bam(self): ''' @@ -118,7 +115,7 @@ class Variant_calling(Step): bam_dict = defaultdict(list) CID_dict = defaultdict(dict) cells_dir = f'{self.outdir}/cells/' - + # read bam and split samfile = pysam.AlignmentFile(self.splitN_bam, "rb") header = samfile.header @@ -134,7 +131,6 @@ class Variant_calling(Step): # assign read to barcode bam_dict[barcode].append(read) - self.split_bam.logger.info('writing cell bam...') # write new bam CID = 0 @@ -174,7 +170,7 @@ class Variant_calling(Step): f'samtools sort {bam} -o {sorted_bam}' ) subprocess.check_call(cmd_sort, shell=True) - + # mpileup bcf = f'{outdir}/cells/cell{CID}/cell{CID}.bcf' cmd_mpileup = ( @@ -238,7 +234,6 @@ class Variant_calling(Step): def read_CID(self): return read_CID(self.CID_file) - @utils.add_log def merge_vcf(self): ''' @@ -256,7 +251,7 @@ class Variant_calling(Step): for CID in CIDs: CID = str(CID) vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' - vcf = pysam.VariantFile(vcf_file,'r') + vcf = pysam.VariantFile(vcf_file, 'r') for rec in vcf.fetch(): v = ','.join([str(getattr(rec, col)) for col in v_cols]) if not v in v_dict: @@ -270,12 +265,12 @@ class Variant_calling(Step): def get_vcf_header(CIDs): CID = CIDs[0] vcf_file = f'{self.outdir}/cells/cell{CID}/cell{CID}_norm.vcf' - vcf = pysam.VariantFile(vcf_file,'r') + vcf = pysam.VariantFile(vcf_file, 'r') return vcf.header vcf_header = get_vcf_header(CIDs) vcf_header.info.add('VID', number=1, type='String', description='Variant ID') vcf_header.info.add('CID', number=1, type='String', description='Cell ID') - merged_vcf = pysam.VariantFile(self.final_vcf_file,'w', header=vcf_header) + merged_vcf = pysam.VariantFile(self.final_vcf_file, 'w', header=vcf_header) VID = 0 for v in sorted(v_dict.keys()): @@ -285,7 +280,7 @@ class Variant_calling(Step): record = merged_vcf.new_record() cols = ['chrom', 'pos', 'alleles'] for col in cols: - setattr(record,col, getattr(rec,col)) + setattr(record, col, getattr(rec, col)) record.info['VID'] = str(VID) record.info['CID'] = CID merged_vcf.write(record) @@ -294,12 +289,12 @@ class Variant_calling(Step): @utils.add_log def write_VID_file(self): df_vcf = parse_vcf(self.final_vcf_file) - df_VID = df_vcf.loc[:,['VID', 'chrom', 'pos', 'ref', 'alt']] + df_VID = df_vcf.loc[:, ['VID', 'chrom', 'pos', 'ref', 'alt']] df_VID.to_csv(self.VID_file, sep='\t', index=False) @utils.add_log def add_VID(self): - vcf = pysam.VariantFile(self.args.vcf,'r') + vcf = pysam.VariantFile(self.args.vcf, 'r') vcf_header = vcf.header if 'VID' in vcf_header.info: logging.info('VID is already in vcf file!') @@ -309,7 +304,7 @@ class Variant_calling(Step): VID = 0 for rec in vcf.fetch(): VID += 1 - rec.info['VID'] = str(VID) + rec.info['VID'] = 
str(VID) VID_vcf.write(rec) VID_vcf.close() @@ -323,9 +318,9 @@ class Variant_calling(Step): def get_DP4(row, alt): DP4 = row['DP4'].iloc[0] if alt == 'ref': - indexs = [0,1] + indexs = [0, 1] elif alt == 'alt': - indexs = [2,3] + indexs = [2, 3] umi = sum([DP4[index] for index in indexs]) return umi @@ -333,9 +328,9 @@ class Variant_calling(Step): pos = row['pos'] chrom = row['chrom'] alt = row['alt'] - df_pos = df_cell_vcf[(df_cell_vcf['pos']==pos) & (df_cell_vcf['chrom']==chrom)] - df_ref = df_pos[df_pos['alt']=='.'] - df_alt = df_pos[df_pos['alt']==alt] + df_pos = df_cell_vcf[(df_cell_vcf['pos'] == pos) & (df_cell_vcf['chrom'] == chrom)] + df_ref = df_pos[df_pos['alt'] == '.'] + df_alt = df_pos[df_pos['alt'] == alt] ref_UMI = 0 alt_UMI = 0 if df_ref.shape[0] != 0: @@ -345,15 +340,15 @@ class Variant_calling(Step): return ref_UMI, alt_UMI, pos, chrom, alt for index in df_vcf.index: - row = df_vcf.loc[index,] + row = df_vcf.loc[index, ] ref_UMI, alt_UMI, _pos, _chrom, _alt = map_vcf_row(row, df_cell_vcf) if (ref_UMI + alt_UMI) != 0: VID = row['VID'] dic = { - 'VID':VID, - 'CID':CID, - 'ref_count':ref_UMI, - 'alt_count':alt_UMI, + 'VID': VID, + 'CID': CID, + 'ref_count': ref_UMI, + 'alt_count': alt_UMI, } df_UMI = df_UMI.append(dic, ignore_index=True) return df_UMI @@ -363,7 +358,7 @@ class Variant_calling(Step): ''' get variant and ref UMI supporting an allele ''' - _df_index, df_valid = self.read_CID() + _df_index, df_valid = self.read_CID() df_UMI_list = [] CID_arg = list(df_valid.index) @@ -372,12 +367,12 @@ class Variant_calling(Step): with ProcessPoolExecutor(self.thread) as pool: for res in pool.map(Variant_calling.cell_UMI, CID_arg, outdir_arg, final_vcf_file_arg): df_UMI_list.append(res) - + df_UMI = pd.concat(df_UMI_list) df_UMI['VID'] = df_UMI['VID'].astype('int') - df_UMI.sort_values(by=['VID','CID'], inplace=True) + df_UMI.sort_values(by=['VID', 'CID'], inplace=True) df_UMI.to_csv(self.variant_count_file, sep='\t', index=False) - + @utils.add_log def write_support_matrix(self): def set_support_bit(row): @@ -390,9 +385,8 @@ class Variant_calling(Step): df_variant_count['support'] = df_variant_count.apply(set_support_bit, axis=1) support_mtx = coo_matrix( (df_variant_count.support, (df_variant_count.VID - 1, df_variant_count.CID - 1)) - ) + ) mmwrite(self.support_matrix_file, support_mtx) - def run(self): self.SplitNCigarReads() @@ -420,20 +414,20 @@ def get_opts_variant_calling(parser, sub_program): parser.add_argument("--genomeDir", help=HELP_DICT['genomeDir'], required=True) parser.add_argument( - "--vcf", + "--vcf", help="""VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level -and use these variants as input vcf.""", +and use these variants as input vcf.""", required=False ) if sub_program: parser.add_argument( "--bam", - help='Input BAM file from step `target_metrics`. ', + help='Input BAM file from step `target_metrics`. ', required=True ) parser.add_argument( - "--match_dir", - help=HELP_DICT['match_dir'], + "--match_dir", + help=HELP_DICT['match_dir'], required=True ) s_common(parser) diff --git a/celescope/tag/analysis_tag.py b/celescope/tag/analysis_tag.py index 4209efff..5c04c5fb 100755 --- a/celescope/tag/analysis_tag.py +++ b/celescope/tag/analysis_tag.py @@ -10,6 +10,7 @@ class Analysis_tag(Step, AnalysisMixin): Features - Combine scRNA-Seq clustering infromation with tag assignment. 
""" + def __init__(self, args, step_name): Step.__init__(self, args, step_name) AnalysisMixin.__init__(self, args) @@ -31,8 +32,9 @@ def get_opts_analysis_tag(parser, sub_program): parser.add_argument("--match_dir", help="Match celescope scRNA-Seq directory. ", required=True) parser = s_common(parser) + @utils.add_log def analysis_tag(args): step_name = 'analysis_tag' ana = Analysis_tag(args, step_name) - ana.run() \ No newline at end of file + ana.run() diff --git a/celescope/tag/count_tag.py b/celescope/tag/count_tag.py index 52cc42b9..6d9121a2 100755 --- a/celescope/tag/count_tag.py +++ b/celescope/tag/count_tag.py @@ -2,44 +2,43 @@ assign cell identity based on SNR and UMI_min """ +from celescope.__init__ import ROOT_PATH +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils +import pandas as pd +import numpy as np +import matplotlib.pyplot as plt import subprocess import matplotlib matplotlib.use('Agg') -import matplotlib.pyplot as plt -import numpy as np -import pandas as pd - -import celescope.tools.utils as utils -from celescope.tools.step import Step, s_common -from celescope.__init__ import ROOT_PATH def get_opts_count_tag(parser, sub_program): parser.add_argument( "--UMI_min", - help="Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*.", + help="Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*.", default="auto" ) parser.add_argument( - "--dim", - help="Default=1. Tag dimentions. Usually we use 1-dimentional tag.", + "--dim", + help="Default=1. Tag dimentions. Usually we use 1-dimentional tag.", default=1 ) parser.add_argument( "--SNR_min", help="""Default='auto'. Minimum signal-to-noise ratio. -Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. """, +Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. """, default="auto" ) parser.add_argument("--combine_cluster", - help="Conbine cluster tsv file.", default=None) + help="Conbine cluster tsv file.", default=None) parser.add_argument( - "--coefficient", + "--coefficient", help="""Default=0.1. If `SNR_min` is 'auto', minimum signal-to-noise ratio is calulated as `SNR_min = max(median(SNRs) * coefficient, 2)`. 
-Smaller `coefficient` will cause less *multiplet* in the tag assignment.""", +Smaller `coefficient` will cause less *multiplet* in the tag assignment.""", default=0.1 ) if sub_program: @@ -47,6 +46,7 @@ Smaller `coefficient` will cause less *multiplet* in the tag assignment.""", parser.add_argument("--match_dir", help="Match celescope scRNA-Seq directory.", required=True) s_common(parser) + def count_tag(args): step_name = "count_tag" @@ -85,11 +85,11 @@ class Count_tag(Step): # read self.df_read_count = pd.read_csv(self.read_count_file, sep="\t", index_col=0) - + match_dict = utils.parse_match_dir(self.match_dir) - self.match_barcode = match_dict['match_barcode'] + self.match_barcode = match_dict['match_barcode'] self.cell_total = match_dict['cell_total'] - self.tsne_file = match_dict['tsne_coord'] + self.tsne_file = match_dict['tsne_coord'] self.matrix_dir = match_dict['matrix_dir'] # init @@ -163,7 +163,6 @@ class Count_tag(Step): signal_tags_str = "_".join(signal_tags) return signal_tags_str - def write_and_plot(self, df, column_name, count_file, plot_file): df_count = df.groupby(["tag", column_name]).size().unstack() df_count.fillna(0, inplace=True) @@ -278,7 +277,7 @@ class Count_tag(Step): plot_file=self.combine_cluster_plot ) - sr_tag_count = df_UMI_cell["tag"].value_counts() # series(index:tag name, value:tag count) + sr_tag_count = df_UMI_cell["tag"].value_counts() # series(index:tag name, value:tag count) for tag_name in ("Undetermined", "Multiplet"): self.add_metric( name=tag_name + ' Cells', @@ -311,4 +310,3 @@ class Count_tag(Step): ) Count_tag.seurat_hashtag.logger.info(cmd) subprocess.check_call(cmd, shell=True) - diff --git a/celescope/tag/mapping_tag.py b/celescope/tag/mapping_tag.py index 6cd44423..290c0356 100755 --- a/celescope/tag/mapping_tag.py +++ b/celescope/tag/mapping_tag.py @@ -12,15 +12,15 @@ from celescope.tools.step import Step, s_common def get_opts_mapping_tag(parser, sub_program): parser.add_argument( - "--fq_pattern", + "--fq_pattern", help="""Required. R2 read pattern. The number after the letter represents the number of bases. `L` linker(common sequences) `C` tag barcode -""", +""", required=True ) parser.add_argument( - "--barcode_fasta", + "--barcode_fasta", help="""Required. Tag barcode fasta file. It will check the mismatches between tag barcode sequence in R2 reads with all tag barcode sequence in barcode_fasta. It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. @@ -35,11 +35,11 @@ AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA >tag_3 CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG ``` -""", +""", required=True, ) parser.add_argument( - "--linker_fasta", + "--linker_fasta", help="""Optional. If provided, it will check the mismatches between linker sequence in R2 reads with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. 
""", @@ -134,8 +134,8 @@ class Mapping_tag(Step): if miss_length > 2: reads_unmapped_too_short += 1 continue - seq_barcode = seq_barcode + "A" * miss_length - + seq_barcode = seq_barcode + "A" * miss_length + # check linker if self.linker_length != 0: valid_linker = False @@ -145,7 +145,7 @@ class Mapping_tag(Step): break else: valid_linker = True - + if not valid_linker: reads_unmapped_invalid_iinker += 1 continue diff --git a/celescope/tag/multi_tag.py b/celescope/tag/multi_tag.py index 0686fc6b..23d0e9b1 100755 --- a/celescope/tag/multi_tag.py +++ b/celescope/tag/multi_tag.py @@ -25,7 +25,6 @@ class Multi_tag(Multi): ) self.process_cmd(cmd, step, sample, m=5, x=1) - def analysis_tag(self, sample): step = 'analysis_tag' tsne_tag_file = f'{self.outdir_dic[sample]["count_tag"]}/{sample}_tsne_tag.tsv' @@ -49,10 +48,10 @@ class Multi_tag(Multi): self.process_cmd(cmd, step, sample, m=5, x=1) - def main(): multi = Multi_tag(__ASSAY__) multi.run() + if __name__ == '__main__': main() diff --git a/celescope/tag/split_tag.py b/celescope/tag/split_tag.py index aa3bb587..53b2a732 100644 --- a/celescope/tag/split_tag.py +++ b/celescope/tag/split_tag.py @@ -12,6 +12,7 @@ import celescope.tools.utils as utils from celescope.tools.step import Step, s_common from celescope.__init__ import HELP_DICT + class Split_tag(Step): """ Features @@ -20,6 +21,7 @@ class Split_tag(Step): Output - `fastq/{tag}_{1,2}.fq` Fastq files of each tag. """ + def __init__(self, args, step_name): Step.__init__(self, args, step_name) @@ -45,7 +47,6 @@ class Split_tag(Step): self.tag_read_index_dict = defaultdict(set) - @utils.add_log def write_r2_fastq_files(self): read_num = 0 @@ -73,25 +74,26 @@ class Split_tag(Step): for tag in self.tag_read_index_dict: if read_index in self.tag_read_index_dict[tag]: self.r1_fastq_files_handle[tag].write(str(read) + '\n') - + for tag in self.r1_fastq_files_handle: self.r1_fastq_files_handle[tag].close() - @utils.add_log def run(self): if self.args.split_fastq: self.write_r2_fastq_files() self.write_r1_fastq_files() + def split_tag(args): step_name = "split_tag" runner = Split_tag(args, step_name) runner.run() + def get_opts_split_tag(parser, sub_program): parser.add_argument( - "--split_fastq", + "--split_fastq", help="If used, will split scRNA-Seq fastq file according to tag assignment.", action='store_true', ) @@ -100,4 +102,3 @@ def get_opts_split_tag(parser, sub_program): parser.add_argument("--match_dir", help=HELP_DICT['match_dir'], required=True) parser.add_argument("--R1_read", help='R1 read path.') s_common(parser) - diff --git a/celescope/tag/tests.py b/celescope/tag/tests.py index 446f17c2..42bedeb3 100644 --- a/celescope/tag/tests.py +++ b/celescope/tag/tests.py @@ -19,4 +19,4 @@ class Tests(unittest.TestCase): f'--matrix_10X {matrix_10X} ' ) print(cmd) - subprocess.check_call(cmd, shell=True) \ No newline at end of file + subprocess.check_call(cmd, shell=True) diff --git a/celescope/tcr_fl/__init__.py b/celescope/tcr_fl/__init__.py index 55c8fc6b..116f69bb 100755 --- a/celescope/tcr_fl/__init__.py +++ b/celescope/tcr_fl/__init__.py @@ -1,2 +1,2 @@ __STEPS__ = ['sample', 'barcode', 'cutadapt', 'split_fq', 'assemble'] -__ASSAY__ = 'tcr_fl' \ No newline at end of file +__ASSAY__ = 'tcr_fl' diff --git a/celescope/tcr_fl/assemble.py b/celescope/tcr_fl/assemble.py index 1fae3006..55aae9e4 100755 --- a/celescope/tcr_fl/assemble.py +++ b/celescope/tcr_fl/assemble.py @@ -29,6 +29,7 @@ def tracer(fq, outdir): ) subprocess.check_call(cmd, shell=True) + class Assemble_TCR(Step): def 
__init__(self, args, step): Step.__init__(self, args, step) diff --git a/celescope/tcr_fl/barcode_index.py b/celescope/tcr_fl/barcode_index.py index 1e05644a..8ab80a11 100755 --- a/celescope/tcr_fl/barcode_index.py +++ b/celescope/tcr_fl/barcode_index.py @@ -25,6 +25,3 @@ class Barcode_index(): write index-barcode to file """ self.df_index.to_csv(file_name, sep='\t') - - - \ No newline at end of file diff --git a/celescope/tcr_fl/multi_tcr_fl.py b/celescope/tcr_fl/multi_tcr_fl.py index d4e7e037..b9bcd780 100755 --- a/celescope/tcr_fl/multi_tcr_fl.py +++ b/celescope/tcr_fl/multi_tcr_fl.py @@ -40,5 +40,6 @@ def main(): multi = Multi_tcr_fl(__ASSAY__) multi.run() + if __name__ == '__main__': main() diff --git a/celescope/tcr_fl/split_fq.py b/celescope/tcr_fl/split_fq.py index c46c0538..b6b578c8 100755 --- a/celescope/tcr_fl/split_fq.py +++ b/celescope/tcr_fl/split_fq.py @@ -24,7 +24,7 @@ def get_nCell_barcodes(fq, nCell): for barcode in count_dict: barcode_dict[barcode] = len(count_dict[barcode]) barcodes = pd.DataFrame.from_dict(barcode_dict, orient='index').sort_values( - 0, ascending=False).iloc[0:nCell,].index + 0, ascending=False).iloc[0:nCell, ].index return barcodes @@ -46,7 +46,7 @@ def split_run(fq, fq_outdir, barcodes=None, nCell=None): if barcode in barcodes: cell_index = bi.index_dict[barcode] entry_dict[cell_index].append(entry) - + # write to file for cell_index in entry_dict: with open(f'{fq_outdir}/{cell_index}.fq', 'w') as f: @@ -69,10 +69,11 @@ def split_fq(args): fq_outdir = f'{args.outdir}/fastq' if nCell and nCell != 'None': nCell = int(nCell) - bi = split_run(args.fq, fq_outdir, barcodes, nCell) + bi = split_run(args.fq, fq_outdir, barcodes, nCell) index_file = f'{outdir}/{sample}_index.tsv' bi.df_index.to_csv(index_file, sep='\t') + def get_opts_split_fq(parser, sub_program): if sub_program: parser.add_argument('--outdir', help='output dir', required=True) @@ -81,4 +82,4 @@ def get_opts_split_fq(parser, sub_program): parser.add_argument('--assay', help='assay', required=True) parser.add_argument( "--match_dir", help="match scRNA-Seq dir") - parser.add_argument("--nCell", help="select top N cell") \ No newline at end of file + parser.add_argument("--nCell", help="select top N cell") diff --git a/celescope/tests/conftest.py b/celescope/tests/conftest.py index 683dc23a..25afa488 100644 --- a/celescope/tests/conftest.py +++ b/celescope/tests/conftest.py @@ -11,4 +11,4 @@ def pytest_generate_tests(metafunc): if 'assays' in metafunc.fixturenames and assays_value is not None: metafunc.parametrize("assays", [assays_value]) if 'test_dir' in metafunc.fixturenames and test_dir_value is not None: - metafunc.parametrize("test_dir", [test_dir_value]) \ No newline at end of file + metafunc.parametrize("test_dir", [test_dir_value]) diff --git a/celescope/tests/test_function.py b/celescope/tests/test_function.py index 8936c9aa..35cb83ed 100755 --- a/celescope/tests/test_function.py +++ b/celescope/tests/test_function.py @@ -8,8 +8,8 @@ from celescope.tools.step import Step class Tests(unittest.TestCase): def setUp(self): - pass - + pass + @unittest.skip("tested") def test_stat_to_metric(self): os.chdir('/SGRNJ01/RD_dir/pipeline_test/zhouyiqi/multi_tests/rna') @@ -28,4 +28,4 @@ class Tests(unittest.TestCase): print(obj.content_dict['metric']) def test_test(self): - assert 0 == 0 \ No newline at end of file + assert 0 == 0 diff --git a/celescope/tests/test_multi.py b/celescope/tests/test_multi.py index 68c9f23d..fe853db7 100755 --- a/celescope/tests/test_multi.py +++ 
b/celescope/tests/test_multi.py @@ -34,6 +34,7 @@ def run_single(assay, test_dir): print("*" * 20 + "success " + assay + "*" * 20) return f"{assay} success." + @utils.add_log def test_mutiple(assays, test_dir): """ @@ -57,4 +58,3 @@ def test_mutiple(assays, test_dir): for result in res_list: print(result) assert not any((string.find("failed") != -1 for string in res_list)) -
diff --git a/celescope/tools/analysis_mixin.py b/celescope/tools/analysis_mixin.py index 85e92af5..03ccb2af 100755 --- a/celescope/tools/analysis_mixin.py +++ b/celescope/tools/analysis_mixin.py @@ -12,14 +12,14 @@ class AnalysisMixin(): """ mixin class for analysis child class must inherit Step class - """ + """ def __init__(self, args): if hasattr(args, "match_dir") and args.match_dir: self.match_dir = args.match_dir self.read_match_dir() else: - self.match_dir = args.outdir + "/../" # use self + self.match_dir = args.outdir + "/../" # use self @utils.add_log def seurat(self, matrix_file, save_rds, genomeDir): @@ -37,7 +37,6 @@ class AnalysisMixin(): AnalysisMixin.seurat.logger.info(cmd) subprocess.check_call(cmd, shell=True) - @utils.add_log def auto_assign(self, type_marker_tsv): rds = f'{self.outdir}/{self.sample}.rds' @@ -87,12 +86,12 @@ class AnalysisMixin(): return html code """ - avg_logfc_col = "avg_log2FC" # seurat 4 - if "avg_logFC" in self.marker_df.columns: # seurat 2.3.4 + avg_logfc_col = "avg_log2FC" # seurat 4 + if "avg_logFC" in self.marker_df.columns: # seurat 2.3.4 avg_logfc_col = "avg_logFC" marker_df = self.marker_df.loc[:, - ["cluster", "gene", avg_logfc_col, "pct.1", "pct.2", "p_val_adj"] - ] + ["cluster", "gene", avg_logfc_col, "pct.1", "pct.2", "p_val_adj"] + ] marker_df["cluster"] = marker_df["cluster"].apply(lambda x: f"cluster {x}") return marker_df
diff --git a/celescope/tools/barcode.py b/celescope/tools/barcode.py index c03b8874..f7d03403 100755 --- a/celescope/tools/barcode.py +++ b/celescope/tools/barcode.py @@ -16,6 +16,7 @@ from celescope.tools.step import Step, s_common MIN_T = 10 + def seq_ranges(seq, pattern_dict): # get subseq with intervals in arr and concatenate return ''.join([seq[x[0]:x[1]]for x in pattern_dict]) @@ -222,7 +223,7 @@ class Chemistry(): else: linker_wrong_dict[linker] += 1 - percent_T4 = T4_n / self.nRead + percent_T4 = T4_n / self.nRead percent_L57C = L57C_n / self.nRead Chemistry.get_chemistry.logger.info(f'percent T4: {percent_T4}') Chemistry.get_chemistry.logger.info(f'percent L57C: {percent_L57C}') @@ -302,7 +303,7 @@ class Barcode(Step): self.lowQual = args.lowQual self.allowNoPolyT = args.allowNoPolyT self.allowNoLinker = args.allowNoLinker - self.nopolyT = args.nopolyT # true == output nopolyT reads + self.nopolyT = args.nopolyT # true == output nopolyT reads self.noLinker = args.noLinker # out file @@ -318,7 +319,6 @@ class Barcode(Step): self.noLinker_1 = f'{self.outdir}/noLinker_1.fq' self.noLinker_2 = f'{self.outdir}/noLinker_2.fq' - @utils.add_log def run(self): """ @@ -417,7 +417,7 @@ class Barcode(Step): '@%s\n%s\n+\n%s\n' % (header2, seq2, qual2)) continue - # lowQual filter + # lowQual filter C_U_quals_ascii = seq_ranges( qual1, pattern_dict['C'] + pattern_dict['U']) # C_U_quals_ord = [ord(q) - 33 for q in C_U_quals_ascii] @@ -440,7 +440,7 @@ class Barcode(Step): continue elif bool_corrected: self.linker_corrected_num += 1 - + # barcode filter seq_list = get_seq_list(seq1, pattern_dict, 'C') if bool_whitelist: @@ -498,11 +498,11 @@ class Barcode(Step): ''' with open(self.stat_file, 'w') as fh: stat_info = stat_info %
(utils.format_number(self.total_num), utils.format_number(self.clean_num), - cal_percent(self.clean_num), BarcodesQ30, - UMIsQ30) + cal_percent(self.clean_num), BarcodesQ30, + UMIsQ30) stat_info = re.sub(r'^\s+', r'', stat_info, flags=re.M) fh.write(stat_info) - + self.clean_up() @@ -515,13 +515,13 @@ def barcode(args): def get_opts_barcode(parser, sub_program=True): parser.add_argument( - '--chemistry', + '--chemistry', help="""Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: - `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. - `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. - `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the same time.""", - choices=list(__PATTERN_DICT__.keys()), + choices=list(__PATTERN_DICT__.keys()), default='auto' ) parser.add_argument( @@ -538,14 +538,14 @@ same time.""", help='Cell barcode whitelist file path, one cell barcode per line.' ) parser.add_argument( - '--linker', + '--linker', help='Linker whitelist file path, one linker per line.' ) parser.add_argument( - '--lowQual', + '--lowQual', help='Default 0. Bases in cell barcode and UMI whose phred value are lower than \ lowQual will be regarded as low-quality bases.', - type=int, + type=int, default=0 ) parser.add_argument( @@ -560,23 +560,23 @@ lowQual will be regarded as low-quality bases.', action='store_true', ) parser.add_argument( - '--noLinker', + '--noLinker', help='Outputs R1 reads without correct linker.', action='store_true', ) parser.add_argument( - '--allowNoPolyT', - help="Allow valid reads without polyT.", + '--allowNoPolyT', + help="Allow valid reads without polyT.", action='store_true' ) parser.add_argument( - '--allowNoLinker', - help="Allow valid reads without correct linker.", + '--allowNoLinker', + help="Allow valid reads without correct linker.", action='store_true' ) parser.add_argument( - '--gzip', - help="Output gzipped fastq files.", + '--gzip', + help="Output gzipped fastq files.", action='store_true' ) if sub_program: diff --git a/celescope/tools/cellranger3/cell_calling_3.py b/celescope/tools/cellranger3/cell_calling_3.py index 54f15d14..a47f6513 100755 --- a/celescope/tools/cellranger3/cell_calling_3.py +++ b/celescope/tools/cellranger3/cell_calling_3.py @@ -56,7 +56,7 @@ def estimate_profile_sgt(matrix, barcode_indices, nz_feat): profile (np.array(float)): Estimated probabilities of length len(nz_feat). 
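This profile later feeds `eval_multinomial_loglikelihoods`, so features unseen in the ambient pool must still receive nonzero probability. A toy illustration with made-up counts, using a crude pseudocount in place of the actual Simple Good-Turing smoothing:
```
import numpy as np
from scipy import stats

# Pooled ambient counts for 4 genes; gene 4 was never seen in empty barcodes.
ambient_counts = np.array([90, 9, 1, 0])

# Unsmoothed profile: the unseen gene has probability 0, so any candidate
# barcode containing one read of it scores log-likelihood -inf.
p_raw = ambient_counts / ambient_counts.sum()

# Crude pseudocount smoothing, standing in for Simple Good-Turing:
# reserve a little mass for unseen genes so likelihoods stay finite.
p_smooth = (ambient_counts + 0.1) / (ambient_counts + 0.1).sum()

bc = np.array([5, 1, 0, 1])  # counts observed for one candidate barcode
print(stats.multinomial.logpmf(bc, bc.sum(), p_raw))     # -inf
print(stats.multinomial.logpmf(bc, bc.sum(), p_smooth))  # finite
```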
""" # Initial profile estimate - prof_mat = matrix[:,barcode_indices] + prof_mat = matrix[:, barcode_indices] profile = np.ravel(prof_mat[nz_feat, :].sum(axis=1)) zero_feat = np.flatnonzero(profile == 0) @@ -105,13 +105,13 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, TBD """ NonAmbientBarcodeResult = namedtuple('NonAmbientBarcodeResult', - ['eval_bcs', # Candidate barcode indices (n) - 'log_likelihood',# Ambient log likelihoods (n) - 'pvalues', # pvalues (n) - 'pvalues_adj', # B-H adjusted pvalues (n) - 'is_nonambient', # Boolean nonambient calls (n) - ]) - + ['eval_bcs', # Candidate barcode indices (n) + 'log_likelihood', # Ambient log likelihoods (n) + 'pvalues', # pvalues (n) + 'pvalues_adj', # B-H adjusted pvalues (n) + 'is_nonambient', # Boolean nonambient calls (n) + ]) + # Estimate an ambient RNA profile umis_per_bc = np.squeeze(np.asarray(raw_mat.sum(axis=0))) # get the index of sorted umis_per_bc (ascending, bc_order[0] is the index of the smallest element in umis_per_bc) @@ -130,47 +130,48 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, if len(use_bcs) > 0: try: - ## Get used "Gene" features (eval_features) - ## and the smoothed prob profile per "Gene" (ambient_profile_p) + # Get used "Gene" features (eval_features) + # and the smoothed prob profile per "Gene" (ambient_profile_p) eval_features, ambient_profile_p = est_background_profile_sgt(raw_mat.tocsc(), use_bcs) except cr_sgt.SimpleGoodTuringError as e: print(str(e)) else: eval_features = np.zeros(0, dtype=int) ambient_profile_p = np.zeros(0) - - ### Choose candidate cell barcodes - ### Regular ordmag filter - gg_filtered_indices, gg_filtered_metrics, _msg = cr_stats.filter_cellular_barcodes_ordmag(umis_per_bc, recovered_cells=recovered_cells) + + # Choose candidate cell barcodes + # Regular ordmag filter + gg_filtered_indices, gg_filtered_metrics, _msg = cr_stats.filter_cellular_barcodes_ordmag( + umis_per_bc, recovered_cells=recovered_cells) print('Cell-called barcodes metrics:') print('\n'.join(list(map(lambda x: '{}: {}'.format(*x), list(gg_filtered_metrics.items()))))) print('==============================') - + orig_cell_bc_set = set(gg_filtered_indices) orig_cells = np.flatnonzero(np.fromiter((bc in orig_cell_bc_set for bc in range(raw_mat.shape[1])), dtype=bool)) - ## No good incoming cell calls + # No good incoming cell calls if orig_cells.sum() == 0: print('Error: No original cells are selected!') return None, None, None - ## Look at non-cell barcodes above a minimum UMI count + # Look at non-cell barcodes above a minimum UMI count eval_bcs = np.ma.array(np.arange(raw_mat.shape[1])) eval_bcs[orig_cells] = ma.masked median_initial_umis = np.median(umis_per_bc[orig_cells]) - + min_umis = int(max(min_umis_nonambient, round(np.ceil(median_initial_umis * min_umi_frac_of_median)))) - + print('Median UMIs of initial cell calls: {}'.format(median_initial_umis)) print('Min UMIs: {}'.format(min_umis)) eval_bcs[umis_per_bc < min_umis] = ma.masked n_unmasked_bcs = len(eval_bcs) - eval_bcs.mask.sum() - ## Take the top N_CANDIDATE_BARCODES by UMI count, of barcodes that pass the above criteria - ## For evaluation of non-ambient bcs using background info estimated from SGT + # Take the top N_CANDIDATE_BARCODES by UMI count, of barcodes that pass the above criteria + # For evaluation of non-ambient bcs using background info estimated from SGT eval_bcs = np.argsort(ma.masked_array(umis_per_bc, mask=eval_bcs.mask))[:n_unmasked_bcs][-N_CANDIDATE_BARCODES:] if len(eval_bcs) == 0: @@ -193,7 +194,7 @@ def 
find_nonambient_barcodes(raw_mat, recovered_cells, obs_loglk = cr_stats.eval_multinomial_loglikelihoods(eval_mat, ambient_profile_p) # Simulate log likelihoods - distinct_ns, sim_loglk = cr_stats.simulate_multinomial_loglikelihoods(ambient_profile_p, umis_per_bc[eval_bcs], + distinct_ns, sim_loglk = cr_stats.simulate_multinomial_loglikelihoods(ambient_profile_p, umis_per_bc[eval_bcs], num_sims=10000, verbose=True) # Compute p-values @@ -205,10 +206,10 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, print('Number of non-ambient barcodes from SGT:', len(eval_bcs[is_nonambient])) - ## Runxi's filtering + # Runxi's filtering print('Identify {} cell-associated barcodes'.format(len(orig_cells)+len(eval_bcs[is_nonambient]))) - ## of barcodes overlapped w/ the cellranger results + # of barcodes overlapped w/ the cellranger results filtered_bc_indices = np.concatenate((orig_cells, eval_bcs[is_nonambient]), axis=None) return filtered_bc_indices, gg_filtered_metrics, NonAmbientBarcodeResult( @@ -223,10 +224,10 @@ def find_nonambient_barcodes(raw_mat, recovered_cells, def cell_calling_3(all_matrix_10X_dir, expected_cell_num): raw_mat_path = os.path.join(all_matrix_10X_dir, MATRIX_FILE_NAME) - raw_mat = scipy.io.mmread(raw_mat_path) # scipy.sparse.coo.coo_matrix + raw_mat = scipy.io.mmread(raw_mat_path) # scipy.sparse.coo.coo_matrix raw_features_path = os.path.join(all_matrix_10X_dir, FEATURE_FILE_NAME) - raw_features_df = pd.read_csv(raw_features_path, sep='\t', error_bad_lines=False, names=['id','name','type']) + raw_features_df = pd.read_csv(raw_features_path, sep='\t', error_bad_lines=False, names=['id', 'name', 'type']) raw_features_df['id'].tolist() raw_features_df['name'].tolist() raw_features_df['type'].tolist() @@ -235,10 +236,10 @@ def cell_calling_3(all_matrix_10X_dir, expected_cell_num): raw_barcodes_df = pd.read_csv(raw_barcodes_path, sep='\t', error_bad_lines=False, names=['barcode']) raw_barcodes = np.array(raw_barcodes_df['barcode'].tolist()) - ### Run cell calling + # Run cell calling filtered_bc_indices, round_1_filtered_metrics, _non_ambient_barcode_result = find_nonambient_barcodes( - raw_mat=raw_mat,recovered_cells=expected_cell_num) - + raw_mat=raw_mat, recovered_cells=expected_cell_num) + cell_bc = raw_barcodes[filtered_bc_indices] initial_cell_num = round_1_filtered_metrics['filtered_bcs'] - return cell_bc, initial_cell_num \ No newline at end of file + return cell_bc, initial_cell_num diff --git a/celescope/tools/cellranger3/get_plot_elements.py b/celescope/tools/cellranger3/get_plot_elements.py index aaf5699a..85951fba 100755 --- a/celescope/tools/cellranger3/get_plot_elements.py +++ b/celescope/tools/cellranger3/get_plot_elements.py @@ -16,7 +16,7 @@ CHARTS_PLOTLY_MODEBAR_TRANSFORM_BUTTONS = [ 'zoomIn2d', 'zoomOut2d', 'autoScale2d', - #'resetScale2d' can't totally disable interaction, it seems-- keep reset option + # 'resetScale2d' can't totally disable interaction, it seems-- keep reset option ] CHARTS_PLOTLY_EXPORT_BUTTONS = [ @@ -38,11 +38,11 @@ CHARTS_PLOTLY_MOVABLE_CONFIG = { BC_RANK_PLOT_LINE_WIDTH = 3 # Gradient scheme used in the barcode rank plot BC_PLOT_COLORS = ['#dddddd', '#d1d8dc', '#c6d3dc', '#bacfdb', '#aecada', '#a3c5d9', '#97c0d9', '#8cbbd8', '#80b7d7', - '#74b2d7', '#6aadd6', '#66abd4', '#62a8d2', '#5ea5d1', '#59a2cf', '#559fce', '#519ccc', '#4d99ca', - '#4997c9', '#4594c7', '#4191c5', '#3d8dc4', '#3a8ac2', '#3787c0', '#3383be', '#3080bd', '#2c7cbb', - '#2979b9', '#2676b7', '#2272b6', '#1f6eb3', '#1d6ab0', '#1a65ac', '#1861a9', '#155ca6', 
'#1358a2', - '#10539f', '#0e4f9b', '#0b4a98', '#094695', '#09438f', '#0a4189', '#0c3f83', '#0d3d7c', '#0e3b76', - '#103970', '#11366a', '#123463', '#14325d', '#153057'] + '#74b2d7', '#6aadd6', '#66abd4', '#62a8d2', '#5ea5d1', '#59a2cf', '#559fce', '#519ccc', '#4d99ca', + '#4997c9', '#4594c7', '#4191c5', '#3d8dc4', '#3a8ac2', '#3787c0', '#3383be', '#3080bd', '#2c7cbb', + '#2979b9', '#2676b7', '#2272b6', '#1f6eb3', '#1d6ab0', '#1a65ac', '#1861a9', '#155ca6', '#1358a2', + '#10539f', '#0e4f9b', '#0b4a98', '#094695', '#09438f', '#0a4189', '#0c3f83', '#0d3d7c', '#0e3b76', + '#103970', '#11366a', '#123463', '#14325d', '#153057'] CHARTS = [ { @@ -50,7 +50,7 @@ CHARTS = [ 'title': 'Barcode Rank', 'width': 470, 'height': 313, - 'margin': { 'l': 60, 'r': 0, 't': 30, 'b': 40 }, + 'margin': {'l': 60, 'r': 0, 't': 30, 'b': 40}, 'hovermode': 'closest', 'xaxis': { 'title': 'Barcodes', diff --git a/celescope/tools/cellranger3/sgt.py b/celescope/tools/cellranger3/sgt.py index 0fa31b23..a99a06a4 100755 --- a/celescope/tools/cellranger3/sgt.py +++ b/celescope/tools/cellranger3/sgt.py @@ -23,7 +23,7 @@ def _averaging_transform(r, nr): dr = np.concatenate(( 0.5 * (d[1:] + d[0:-1]), np.array((d[-1],), dtype=float), - )) + )) return nr.astype(float)/dr @@ -52,9 +52,10 @@ def simple_good_turing(xr, xnr): slope, _intercept, _, _, _ = sp_stats.linregress(np.log(xr), np.log(xnrz)) if slope > -1: - raise SimpleGoodTuringError("The log-log slope is > -1 (%d); the SGT estimator is not applicable to these data." % slope) + raise SimpleGoodTuringError( + "The log-log slope is > -1 (%d); the SGT estimator is not applicable to these data." % slope) - xrst = _rstest(xr,slope) + xrst = _rstest(xr, slope) xrstrel = xrst/xr # Get traditional Good-Turing estimate @@ -72,7 +73,7 @@ def simple_good_turing(xr, xnr): useturing = True for r in range(len(xr)): if not useturing: - xrstcmbrel[r] = xrstrel[r] + xrstcmbrel[r] = xrstrel[r] else: if np.abs(xrstrel[r]-xrstarel[r]) * (1+r)/tursd[r] > 1.65: xrstcmbrel[r] = xrstarel[r] diff --git a/celescope/tools/cellranger3/stats.py b/celescope/tools/cellranger3/stats.py index b435844f..27f931c9 100755 --- a/celescope/tools/cellranger3/stats.py +++ b/celescope/tools/cellranger3/stats.py @@ -186,13 +186,13 @@ def filter_cellular_barcodes_ordmag(bc_counts, recovered_cells): that likely represents a cell """ if recovered_cells is None: - ### Modified parameter, didn't use the default value + # Modified parameter, didn't use the default value recovered_cells = 3000 # recovered_cells = cr_constants.DEFAULT_RECOVERED_CELLS_PER_GEM_GROUP # 3000 - ## Initialize filter result metrics + # Initialize filter result metrics metrics = init_barcode_filter_result() - ## determine max # of cellular barcodes to consider + # determine max # of cellular barcodes to consider max_filtered_bcs = determine_max_filtered_bcs(recovered_cells) metrics['max_filtered_bcs'] = max_filtered_bcs @@ -202,15 +202,15 @@ def filter_cellular_barcodes_ordmag(bc_counts, recovered_cells): return [], metrics, msg # baseline_bc_idx = int(round(float(recovered_cells) * (1 - cr_constants.ORDMAG_RECOVERED_CELLS_QUANTILE))) # Quantile=0.99 - baseline_bc_idx = int(round(float(recovered_cells) * (1 - 0.99))) # Quantile=0.99 + baseline_bc_idx = int(round(float(recovered_cells) * (1 - 0.99))) # Quantile=0.99 baseline_bc_idx = min(baseline_bc_idx, len(nonzero_bc_counts) - 1) assert baseline_bc_idx < max_filtered_bcs # Bootstrap sampling; run algo with many random samples of the data top_n_boot = np.array([ 
find_within_ordmag(np.random.choice(nonzero_bc_counts, len(nonzero_bc_counts)), baseline_bc_idx) - for i in range(100) # 100 -# for i in range(cr_constants.ORDMAG_NUM_BOOTSTRAP_SAMPLES) # 100 + for i in range(100) # 100 + # for i in range(cr_constants.ORDMAG_NUM_BOOTSTRAP_SAMPLES) # 100 ]) metrics.update(summarize_bootstrapped_top_n(top_n_boot)) @@ -224,9 +224,9 @@ def filter_cellular_barcodes_ordmag(bc_counts, recovered_cells): def filter_cellular_barcodes_fixed_cutoff(bc_counts, cutoff): nonzero_bcs = len(bc_counts[bc_counts > 0]) top_n = min(cutoff, nonzero_bcs) - ## np.argsort(bc_counts) => the indices that would sort an array - ## np.argsort(bc_counts)[0] => idx of the smallest element in array - ## np.argsort(bc_counts)[-1] => idx of the largest element in array + # np.argsort(bc_counts) => the indices that would sort an array + # np.argsort(bc_counts)[0] => idx of the smallest element in array + # np.argsort(bc_counts)[-1] => idx of the largest element in array top_bc_idx = np.sort(np.argsort(bc_counts)[::-1][:top_n]) metrics = { 'filtered_bcs': top_n, @@ -354,7 +354,7 @@ def eval_multinomial_loglikelihoods(matrix, profile_p, max_mem_gb=0.1): for chunk_start in range(0, num_bcs, bcs_per_chunk): chunk = slice(chunk_start, chunk_start+bcs_per_chunk) - matrix_chunk = matrix[:,chunk].transpose().toarray() + matrix_chunk = matrix[:, chunk].transpose().toarray() n = matrix_chunk.sum(1) loglk[chunk] = sp_stats.multinomial.logpmf(matrix_chunk, n, p=profile_p) return loglk @@ -415,7 +415,8 @@ def simulate_multinomial_loglikelihoods(profile_p, umis_per_bc, k += 1 if k >= n_sample_feature_block: # Amortize this operation - sampled_features = np.random.choice(len(profile_p), size=n_sample_feature_block, p=profile_p, replace=True) + sampled_features = np.random.choice( + len(profile_p), size=n_sample_feature_block, p=profile_p, replace=True) k = 0 curr_counts[j] += 1 curr_loglk += log_profile_p[j] + np.log(float(n)/curr_counts[j]) @@ -450,6 +451,6 @@ def compute_ambient_pvalues(umis_per_bc, obs_loglk, sim_n, sim_loglk): pvalues = np.zeros(num_barcodes) for i in range(num_barcodes): - num_lower_loglk = np.sum(sim_loglk[sim_n_idx[i],:] < obs_loglk[i]) + num_lower_loglk = np.sum(sim_loglk[sim_n_idx[i], :] < obs_loglk[i]) pvalues[i] = float(1 + num_lower_loglk) / (1 + num_sims) - return pvalues \ No newline at end of file + return pvalues diff --git a/celescope/tools/consensus.py b/celescope/tools/consensus.py index 6492e7c9..0ae2eeba 100755 --- a/celescope/tools/consensus.py +++ b/celescope/tools/consensus.py @@ -18,6 +18,7 @@ class Consensus(Step): Output - `{sample}_consensus.fq` Consensus fastq. 
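The consensus is essentially a per-position vote across all reads of one (barcode, UMI) group. A simplified, quality-unaware sketch of that vote (the real `dumb_consensus` below also tracks base qualities, counts ambiguous bases, and picks the consensus length by a read-length vote):
```
from collections import Counter

def simple_consensus(reads, threshold=0.5):
    # per-position vote over all reads from one (barcode, UMI) group
    length = max(len(r) for r in reads)  # the real code votes on length too
    consensus = []
    for i in range(length):
        bases = Counter(r[i] for r in reads if len(r) > i)
        base, count = bases.most_common(1)[0]
        # keep the majority base only if it reaches the threshold
        consensus.append(base if count / len(reads) >= threshold else "N")
    return "".join(consensus)

print(simple_consensus(["AATA", "AATT", "AAAC", "AAGG"]))  # prints AATN
```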
""" + def __init__(self, args, step_name): Step.__init__(self, args, step_name) @@ -33,8 +34,8 @@ class Consensus(Step): sort_fastq(self.args.fq, self.fq_tmp_file, self.outdir) n, total_ambiguous_base_n, length_list = sorted_dumb_consensus( - fq=self.fq_tmp_file, - outfile=self.consensus_fq, + fq=self.fq_tmp_file, + outfile=self.consensus_fq, threshold=self.args.threshold ) @@ -54,7 +55,6 @@ class Consensus(Step): self.clean_up() - @utils.add_log def sort_fastq(fq, fq_tmp_file, outdir): tmp_dir = f'{outdir}/tmp' @@ -96,7 +96,7 @@ def sorted_dumb_consensus(fq, outfile, threshold): sorted_dumb_consensus.logger.info(f'{n_umi} UMI done.') total_ambiguous_base_n += ambiguous_base_n length_list.append(con_len) - + out_h.close() return n_umi, total_ambiguous_base_n, length_list @@ -159,7 +159,7 @@ def get_read_length(read_list, threshold=0.5): length = max length with read fraction >= threshold elements of read_list: [entry.sequence,entry.quality] ''' - + n_read = len(read_list) length_dict = defaultdict(int) for read in read_list: @@ -169,11 +169,12 @@ def get_read_length(read_list, threshold=0.5): length_dict[length] = length_dict[length] / n_read fraction = 0 - for length in sorted(length_dict.keys(),reverse=True): + for length in sorted(length_dict.keys(), reverse=True): fraction += length_dict[length] if fraction >= threshold: return length + @utils.add_log def consensus(args): @@ -181,9 +182,10 @@ def consensus(args): consensus_obj = Consensus(args, step_name) consensus_obj.run() + def get_opts_consensus(parser, sub_program): parser.add_argument("--threshold", help='Default 0.5. Valid base threshold. ', type=float, default=0.5) parser.add_argument("--not_consensus", help="Skip the consensus step. ", action='store_true') if sub_program: parser.add_argument("--fq", help="Required. Fastq file.", required=True) - s_common(parser) \ No newline at end of file + s_common(parser) diff --git a/celescope/tools/count.py b/celescope/tools/count.py index b532f646..b3da47f1 100755 --- a/celescope/tools/count.py +++ b/celescope/tools/count.py @@ -71,6 +71,7 @@ class Count(Step): """ + def __init__(self, args, step): Step.__init__(self, args, step) self.force_cell_num = args.force_cell_num @@ -186,7 +187,7 @@ class Count(Step): with open(self.count_detail_file, 'wt') as fh1: fh1.write('\t'.join(['Barcode', 'geneID', 'UMI', 'count']) + '\n') - def keyfunc(x): + def keyfunc(x): return x.query_name.split('_', 1)[0] for _, g in groupby(samfile, keyfunc): gene_umi_dict = defaultdict(lambda: defaultdict(int)) @@ -477,16 +478,16 @@ def get_opts_count(parser, sub_program): parser.add_argument('--genomeDir', help='Required. Genome directory.') parser.add_argument('--expected_cell_num', help='Default `3000`. Expected cell number.', default=3000) parser.add_argument( - '--cell_calling_method', + '--cell_calling_method', help='Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`.', - choices=['auto', 'cellranger3', 'inflection', ], + choices=['auto', 'cellranger3', 'inflection', ], default='auto', ) if sub_program: parser = s_common(parser) parser.add_argument('--bam', help='Required. BAM file from featureCounts.', required=True) parser.add_argument( - '--force_cell_num', - help='Default `None`. Force the cell number to be this value ± 10%.', + '--force_cell_num', + help='Default `None`. 
Force the cell number to be this value ± 10%.', default=None )
diff --git a/celescope/tools/cutadapt.py b/celescope/tools/cutadapt.py index 82fe66f8..cb596d93 100755 --- a/celescope/tools/cutadapt.py +++ b/celescope/tools/cutadapt.py @@ -15,8 +15,8 @@ class Cutadapt(Step): """ Features - Trim adapters in R2 reads with cutadapt. Default adapters include: - - polyT=A{18}, 18 A bases. - - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. + - polyT=A{18}, 18 A bases. + - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. Output - `cutadapt.log` Cutadapt output log file. @@ -38,7 +38,6 @@ class Cutadapt(Step): self.out_fq2 = f'{self.outdir}/{self.sample}_clean_2.fq{suffix}' self.cutadapt_log_file = f'{self.outdir}/cutadapt.log' - @staticmethod def read_adapter_fasta(adapter_fasta): ''' @@ -100,7 +99,7 @@ class Cutadapt(Step): Cutadapt.run.logger.info(cmd) # need encoding argument to return str results = subprocess.run( - cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, + cmd, stderr=subprocess.STDOUT, stdout=subprocess.PIPE, encoding='utf-8', check=True, shell=True ) cutadapt_log = results.stdout @@ -110,7 +109,7 @@ class Cutadapt(Step): self.clean_up() -@utils.add_log +@utils.add_log def cutadapt(args): step_name = "cutadapt" @@ -122,7 +121,7 @@ def get_opts_cutadapt(parser, sub_program): parser.add_argument('--adapter_fasta', help='Additional adapter fasta file.') parser.add_argument( '--minimum_length', - help='Default `20`. Discard processed reads that are shorter than LENGTH.', + help='Default `20`. Discard processed reads that are shorter than LENGTH.', default=20 ) parser.add_argument( @@ -132,7 +131,7 @@ Some Illumina instruments use a two-color chemistry to encode the four bases. This includes the NextSeq and the NovaSeq. In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. However, dark cycles also occur when sequencing “falls off” the end of the fragment. -The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.""", +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.""", default=20, ) parser.add_argument( @@ -141,12 +140,12 @@ The read then contains a run of high-quality, but incorrect “G” calls at its short matches can occur by chance, leading to erroneously trimmed bases. For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. To reduce the number of falsely trimmed bases, the alignment algorithm requires that -at least {overlap} bases match between adapter and read. """, +at least {overlap} bases match between adapter and read.
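For intuition about the default of 10: under a uniform random-base model the chance that the last k bases of a read spuriously match the adapter prefix is 0.25^k, so the default overlap lowers the per-read false-trim chance from roughly 25% to about one in a million (the uniform-base model is a simplification; real base composition is biased):
```
# chance that a random k-mer at the end of a read matches the adapter prefix
for k in (1, 3, 5, 10):
    print(k, 0.25 ** k)
# 1  0.25
# 3  0.015625
# 5  0.0009765625
# 10 9.5367431640625e-07
```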
""", parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true') parser = s_common(parser) return parser - - diff --git a/celescope/tools/debug.py b/celescope/tools/debug.py index add4f065..c4d05edd 100755 --- a/celescope/tools/debug.py +++ b/celescope/tools/debug.py @@ -54,7 +54,6 @@ class Debug(): with open('fastqc.sh', 'wt') as f: f.write(cmd) - def run(self): self.run_subsample() self.run_STAR() @@ -64,5 +63,3 @@ class Debug(): if __name__ == '__main__': de = Debug() de.run() - - \ No newline at end of file diff --git a/celescope/tools/featureCounts.py b/celescope/tools/featureCounts.py index fc96eb0c..878a784a 100755 --- a/celescope/tools/featureCounts.py +++ b/celescope/tools/featureCounts.py @@ -82,7 +82,7 @@ class FeatureCounts(Step): 'featureCounts ' '-s 1 ' f'-a {self.gtf} ' - f'-o {self.out_prefix} ' # not bam + f'-o {self.out_prefix} ' # not bam '-R BAM ' f'-T {self.thread} ' f'-t {self.args.gtf_type} ' @@ -90,7 +90,7 @@ class FeatureCounts(Step): ) FeatureCounts.run_featureCounts.logger.info(cmd) subprocess.check_call(cmd, shell=True) - + @add_log def name_sort_bam(self): cmd = ( @@ -102,7 +102,6 @@ class FeatureCounts(Step): FeatureCounts.name_sort_bam.logger.info(cmd) subprocess.check_call(cmd, shell=True) - def run(self): self.run_featureCounts() add_tag(self.featureCounts_bam, self.gtf) @@ -153,4 +152,3 @@ def get_opts_featureCounts(parser, sub_program): parser.add_argument('--input', help='Required. BAM file path.', required=True) parser = s_common(parser) return parser - diff --git a/celescope/tools/mkref.py b/celescope/tools/mkref.py index e633995b..9e243459 100755 --- a/celescope/tools/mkref.py +++ b/celescope/tools/mkref.py @@ -29,7 +29,7 @@ class Mkref(): # out file self.config_file = f'{self.genomeDir}/{GENOME_CONFIG}' - + @abc.abstractmethod def run(self): return diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 9fed21b8..c1bbd882 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -23,7 +23,7 @@ class Multi(): self.last_step = '' self.args = None self.steps_not_run = ['mkref'] - + # remove for step in self.steps_not_run: if step in self.__STEPS__: @@ -46,9 +46,9 @@ class Multi(): def common_args(self): readme = f'{self.__ASSAY__} multi-samples' - parser = argparse.ArgumentParser(readme, - formatter_class=ArgFormatter, - conflict_handler='resolve') + parser = argparse.ArgumentParser(readme, + formatter_class=ArgFormatter, + conflict_handler='resolve') parser.add_argument('--mod', help='mod, sjm or shell', choices=['sjm', 'shell'], default='sjm') parser.add_argument( '--mapfile', @@ -100,7 +100,6 @@ class Multi(): fq_dict[sample_name] = [[fq1], [fq2]] col4_dict[sample_name] = col4 - for sample_name in fq_dict: fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) fq_dict[sample_name][1] = ",".join(fq_dict[sample_name][1]) @@ -147,7 +146,7 @@ class Multi(): step_outdir = f"{self.args.outdir}/{sample}/{index:02d}.{step}" self.outdir_dic[sample].update({step: step_outdir}) index += 1 - + def generate_cmd(self, cmd, step, sample, m=1, x=1): if sample: sample = "_" + sample @@ -199,7 +198,7 @@ job_end if args_dict[arg]: matches = [' ', '-'] arg_string = str(args_dict[arg]) - if any(char in arg_string for char in matches): # need quote + if any(char in arg_string for char in matches): # need quote cmd_line += f'--{arg} "{arg_string}" ' else: cmd_line += f'--{arg} {arg_string} ' @@ -215,7 +214,7 @@ job_end f'--fq1 {arr[0]} ' ) self.process_cmd(cmd, step, sample, m=1, x=1) - + def barcode(self, sample): step 
= "barcode" arr = self.fq_dict[sample] @@ -304,7 +303,7 @@ job_end ) from attr_not_exist method_to_call(sample) - def merge_report(self): + def merge_report(self): step = "merge_report" steps_str = ",".join(self.__STEPS__) samples = ','.join(self.fq_dict.keys()) diff --git a/celescope/tools/report.py b/celescope/tools/report.py index 0aaf45ba..08c4a7cc 100755 --- a/celescope/tools/report.py +++ b/celescope/tools/report.py @@ -32,7 +32,6 @@ class reporter: def get_report(self): - json_file = self.outdir + '/.data.json' if not os.path.exists(json_file): data = {} @@ -58,7 +57,7 @@ class reporter: if isinstance(self.df, pd.DataFrame): df = self.df.fillna(value="") - data[self.name + '_table'] = df.values.tolist() + data[self.name + '_table'] = df.values.tolist() if self.table_header: data[self.name + '_table_header'] = self.table_header @@ -74,4 +73,3 @@ class reporter: with open(json_file, 'w') as fh: json.dump(data, fh, indent=4) - diff --git a/celescope/tools/sample.py b/celescope/tools/sample.py index 48422696..f8c91fb6 100755 --- a/celescope/tools/sample.py +++ b/celescope/tools/sample.py @@ -11,7 +11,7 @@ from celescope.tools.step import Step, s_common @utils.add_log def sample(args): - + step_name = "sample" step = Step(args, step_name) @@ -30,7 +30,6 @@ def sample(args): chemistry = ",".join(set(chemistry)) else: chemistry = args.chemistry - if not os.path.exists(outdir): os.system('mkdir -p %s' % outdir) @@ -38,7 +37,7 @@ def sample(args): stat = pd.DataFrame({ "item": ["Sample ID", "Assay", "Chemistry", "Software Version"], "count": [sample_name, assay_description, chemistry, version], - }, + }, columns=["item", "count"] ) stat_file = outdir + "/stat.txt" @@ -55,4 +54,3 @@ def get_opts_sample(parser, sub_program): parser.add_argument('--fq1', help='read1 fq file') parser.add_argument('--chemistry', choices=list(__PATTERN_DICT__.keys()), help='chemistry version', default='auto') return parser - diff --git a/celescope/tools/star_mixin.py b/celescope/tools/star_mixin.py index 3694959d..a5cfa9cf 100755 --- a/celescope/tools/star_mixin.py +++ b/celescope/tools/star_mixin.py @@ -10,6 +10,7 @@ class StarMixin(): """ Mixin class for STAR """ + def __init__(self, args, add_prefix=None): self.fq = args.fq self.genomeDir = args.genomeDir @@ -33,7 +34,7 @@ class StarMixin(): self.STAR_map_log = f'{self.outPrefix}Log.final.out' self.unsort_STAR_bam = f'{self.outPrefix}Aligned.out.bam' self.STAR_bam = f'{self.outPrefix}Aligned.sortedByCoord.out.bam' - + @utils.add_log def STAR(self): cmd = [ @@ -43,7 +44,7 @@ class StarMixin(): '--readFilesIn', self.fq, '--outFilterMultimapNmax', str(self.multi_max), '--outFileNamePrefix', self.outPrefix, - '--outSAMtype', 'BAM', 'Unsorted', # controls sort by Coordinate or not + '--outSAMtype', 'BAM', 'Unsorted', # controls sort by Coordinate or not '--outFilterMatchNmin', str(self.outFilterMatchNmin) ] if self.out_unmapped: @@ -72,7 +73,7 @@ class StarMixin(): @utils.add_log def index_bam(self): - utils.index_bam(self.STAR_bam) + utils.index_bam(self.STAR_bam) def get_star_metrics(self): """ @@ -113,29 +114,29 @@ class StarMixin(): def get_opts_star_mixin(parser, sub_program): parser.add_argument( - '--genomeDir', + '--genomeDir', help='Required. Genome directory.' ) parser.add_argument( - '--outFilterMatchNmin', + '--outFilterMatchNmin', help="""Default `0`. 
Alignment will be output only if the number of matched bases -is higher than or equal to this value.""", +is higher than or equal to this value.""", default=0 ) parser.add_argument( '--out_unmapped', - help='Output unmapped reads', + help='Output unmapped reads', action='store_true' ) parser.add_argument('--STAR_param', help='Other STAR parameters', default="") parser.add_argument( '--outFilterMultimapNmax', - help='Default `1`. How many places are allowed to match a read at most.', + help='Default `1`. How many places are allowed to match a read at most.', default=1 ) parser.add_argument( '--starMem', - help='Default `30`. Maximum memory that STAR can use.', + help='Default `30`. Maximum memory that STAR can use.', default=30 ) if sub_program:
diff --git a/celescope/tools/step.py b/celescope/tools/step.py index 0c759de3..39e75b59 100755 --- a/celescope/tools/step.py +++ b/celescope/tools/step.py @@ -14,6 +14,7 @@ from celescope.tools.utils import add_log Metric = namedtuple("Metric", "name value total fraction") + def s_common(parser): """subparser common arguments """ @@ -21,7 +22,8 @@ def s_common(parser): parser.add_argument('--assay', help='Assay name.', required=True) parser.add_argument('--sample', help='Sample name.', required=True) parser.add_argument('--thread', help='Thread to use.', default=4) - parser.add_argument('--debug', help='If this argument is used, celescope may output additional file for debugging.', action='store_true') + parser.add_argument( + '--debug', help='If this argument is used, celescope may output additional file for debugging.', action='store_true') return parser @@ -29,6 +31,7 @@ class Step: """ Step class """ + def __init__(self, args, step_name): self.step_name = step_name self.args = args @@ -37,7 +40,7 @@ class Step: self.assay = args.assay self.thread = int(args.thread) self.debug = args.debug - # set + # set self.out_prefix = f'{self.outdir}/{self.sample}' # important! make outdir before path_dict because path_dict uses relative paths. @@ -113,7 +116,6 @@ class Step: line += f'{fraction}%' stat_handle.write(line + '\n') - def dump_content(self, slot): '''dump content to json file ''' @@ -146,7 +148,7 @@ class Step: metrics = dict() for metric_name, string in dic.items(): bool_fraction = False - bool_value = False + bool_value = False if '%' in string: bool_fraction = True if "(" in string: @@ -154,17 +156,17 @@ class Step: chars = [',', '%', ')'] for character in chars: string = string.replace(character, '') - + if bool_fraction: - if bool_value: # case 2 + if bool_value: # case 2 value, fraction = string.split('(') fraction = round(float(fraction) / 100, 4) metrics[metric_name] = int(value) metrics[metric_name + ' Fraction'] = fraction - else: # case 3 + else: # case 3 fraction = round(float(string) / 100, 4) metrics[metric_name] = fraction - else: # case 1 + else: # case 1 value = string if '.'
in string: try: @@ -223,5 +225,3 @@ class Step: @abc.abstractmethod def run(self): return - - diff --git a/celescope/tools/target_metrics.py b/celescope/tools/target_metrics.py index 5fe8aa39..b43609ba 100755 --- a/celescope/tools/target_metrics.py +++ b/celescope/tools/target_metrics.py @@ -57,7 +57,7 @@ class Target_metrics(Step): self.count_dict[barcode][gene_name][UMI] += 1 @utils.add_log - def parse_count_dict_add_metrics(self): + def parse_count_dict_add_metrics(self): total_UMIs = 0 enriched_UMIs = 0 enriched_UMIs_in_cells = 0 @@ -80,7 +80,7 @@ class Target_metrics(Step): self.add_metric( name="Total UMIs", value=total_UMIs, - ) + ) self.add_metric( name="Enriched UMIs", @@ -122,4 +122,3 @@ def get_opts_target_metrics(parser, sub_program): parser.add_argument("--bam", help='Input bam file', required=True) parser.add_argument('--match_dir', help=HELP_DICT['match_dir'], required=True) parser = s_common(parser) - diff --git a/celescope/tools/tests.py b/celescope/tools/tests.py index 1bba4dff..ca0d5c5f 100755 --- a/celescope/tools/tests.py +++ b/celescope/tools/tests.py @@ -10,6 +10,7 @@ class Tests(unittest.TestCase): """ Run this test under a temp folder as it will generate some files. """ + def setUp(self): pass @@ -33,11 +34,11 @@ class Tests(unittest.TestCase): step.clean_up() def test_get_read_length(self): - read_list = [['AAAA','FFFF'],['TTT','FFF'],['CCC','FFF'],['GGGGGGG','FFFFFFF']] + read_list = [['AAAA', 'FFFF'], ['TTT', 'FFF'], ['CCC', 'FFF'], ['GGGGGGG', 'FFFFFFF']] assert get_read_length(read_list, 0.5) == 4 def test_dumb_consensus(self): - read_list = [('AAAA','FFFF'),('TTT','FF;'),('CCC','FFF'),('GGGGGGG','FFFFFFF')] + read_list = [('AAAA', 'FFFF'), ('TTT', 'FF;'), ('CCC', 'FFF'), ('GGGGGGG', 'FFFFFFF')] consensus_seq, consensus_qual, _ambiguous_base_n, _con_len = dumb_consensus(read_list, 0.5) print(consensus_qual) assert consensus_seq == 'NNNA' @@ -53,10 +54,10 @@ class Tests(unittest.TestCase): "ccccc2": 199, } n_corrected_umi, n_corrected_read = Count.correct_umi(dic) - sorted_dic = sorted(dic.items(), key=lambda x:x[1]) + sorted_dic = sorted(dic.items(), key=lambda x: x[1]) assert sorted_dic == [('ccccc1', 20), ('apple2', 32), ('bears3', 115), ('ccccc2', 199)] assert n_corrected_umi == 3 - assert n_corrected_read == 2 + 5 + 10 + assert n_corrected_read == 2 + 5 + 10 if __name__ == '__main__': diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index a74388ea..2c012d14 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -48,7 +48,7 @@ def add_log(func): @wraps(func) def wrapper(*args, **kwargs): if args and hasattr(args[0], 'debug') and args[0].debug: - logger.setLevel(10) # debug + logger.setLevel(10) # debug logger.info('start...') start = time.time() @@ -63,10 +63,10 @@ def add_log(func): def using(point=""): - usage=resource.getrusage(resource.RUSAGE_SELF) + usage = resource.getrusage(resource.RUSAGE_SELF) return '''%s: usertime=%s systime=%s mem=%s mb - '''%(point,usage[0],usage[1], - usage[2]/1024.0) + ''' % (point, usage[0], usage[1], + usage[2]/1024.0) def add_mem(func): @@ -185,6 +185,7 @@ def generic_open(file_name, *args, **kwargs): file_obj = open(file_name, *args, **kwargs) return file_obj + @add_log def get_id_name_dict(gtf_file): """ @@ -213,23 +214,23 @@ def get_id_name_dict(gtf_file): gene_id = gene_id_pattern.findall(attributes)[-1] gene_names = gene_name_pattern.findall(attributes) if not gene_names: - gene_name = gene_id + gene_name = gene_id else: gene_name = gene_names[-1] c[gene_name] += 1 if c[gene_name] > 1: 
if gene_id in id_name: assert id_name[gene_id] == gene_name, ( - 'one gene_id with multiple gene_name ' - f'gene_id: {gene_id}, ' - f'gene_name this line: {gene_name}' - f'gene_name previous line: {id_name[gene_id]}' - ) + 'one gene_id with multiple gene_name ' + f'gene_id: {gene_id}, ' + f'gene_name this line: {gene_name}' + f'gene_name previous line: {id_name[gene_id]}' + ) get_id_name_dict.logger.warning( - 'duplicated (gene_id, gene_name)' - f'gene_id: {gene_id}, ' - f'gene_name {gene_name}' - ) + 'duplicated (gene_id, gene_name)' + f'gene_id: {gene_id}, ' + f'gene_name {gene_name}' + ) c[gene_name] -= 1 else: gene_name = f'{gene_name}_{c[gene_name]}' @@ -239,8 +240,8 @@ def get_id_name_dict(gtf_file): @add_log def process_read( - read2_file, pattern_dict, barcode_dict, linker_dict, - barcode_length, linker_length): + read2_file, pattern_dict, barcode_dict, linker_dict, + barcode_length, linker_length): # if valid, return (True) metrics = defaultdict(int) @@ -270,8 +271,8 @@ def process_read( if miss_length > 2: metrics['Reads Unmapped too Short'] += 1 continue - seq_barcode = seq_barcode + "A" * miss_length - + seq_barcode = seq_barcode + "A" * miss_length + # check linker if linker_length != 0: valid_linker = False @@ -281,7 +282,7 @@ def process_read( break else: valid_linker = True - + if not valid_linker: metrics['Reads Unmapped Invalid Linker'] += 1 continue @@ -371,7 +372,7 @@ def gen_stat(df, stat_file): value = f'{format_number(count)}({round(percent * 100, 2)}%)' return value - df.loc[:,'value'] = df.loc[:,'count'] + df.loc[:, 'value'] = df.loc[:, 'count'] df.loc[~df['total_count'].isna(), 'value'] = df.loc[~df['total_count'].isna(), :].apply( add_percent, axis=1 ) @@ -387,9 +388,9 @@ def get_read(library_id, library_path, read='1'): fq_list = ['fq', 'fastq'] suffix_list = ["", ".gz"] read_pattern_list = [ - f'{library_path}/*{library_id}*{read}.{fq_str}{suffix}' - for read in read1_list - for fq_str in fq_list + f'{library_path}/*{library_id}*{read}.{fq_str}{suffix}' + for read in read1_list + for fq_str in fq_list for suffix in suffix_list ] fq_list = [glob.glob(read1_pattern) for read1_pattern in read_pattern_list] @@ -469,8 +470,8 @@ job_end def merge_report( - fq_dict, steps, last_step, sjm_cmd, - sjm_order, logdir, conda, outdir, rm_files): + fq_dict, steps, last_step, sjm_cmd, + sjm_order, logdir, conda, outdir, rm_files): step = "merge_report" steps_str = ",".join(steps) samples = ','.join(fq_dict.keys()) @@ -577,7 +578,7 @@ def report_prepare(outdir, **kwargs): json.dump(data, fh) -def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID','CID')): +def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID', 'CID')): vcf = pysam.VariantFile(vcf_file) df = pd.DataFrame(columns=[col.capitalize() for col in cols] + infos) rec_dict = {} @@ -587,7 +588,7 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID','CID')): rec_dict[col.capitalize()] = getattr(rec, col) if col == 'alleles': rec_dict['Alleles'] = '-'.join(rec_dict['Alleles']) - + for info in infos: rec_dict[info] = rec.info[info] @@ -597,12 +598,12 @@ def parse_vcf(vcf_file, cols=('chrom', 'pos', 'alleles'), infos=('VID','CID')): rec_dict['GT'] = '/'.join(rec_dict['GT']) ''' - df = df.append(pd.Series(rec_dict),ignore_index=True) + df = df.append(pd.Series(rec_dict), ignore_index=True) return df def parse_annovar(annovar_file): - df = pd.DataFrame(columns=['Gene','mRNA', 'Protein', 'COSMIC']) + df = pd.DataFrame(columns=['Gene', 'mRNA', 'Protein', 'COSMIC']) with 
open(annovar_file, 'rt') as f: index = 0 for line in f: @@ -632,7 +633,7 @@ def parse_annovar(annovar_file): if change_attr.startswith('p.'): protein = change_attr.strip('p.') if not (mRNA, protein) in change_list: - change_list.append((mRNA, protein)) + change_list.append((mRNA, protein)) combine = [','.join(item) for item in list(zip(*change_list))] mRNA = combine[0] protein = combine[1] @@ -763,6 +764,7 @@ def find_step_module(assay, step): return step_module + def find_step_module_with_folder(assay, step): init_module = find_assay_init(assay) folder = "" @@ -792,4 +794,4 @@ def sort_bam(input_bam, output_bam, threads=1): def index_bam(input_bam): cmd = f"samtools index {input_bam}" - subprocess.check_call(cmd, shell=True) \ No newline at end of file + subprocess.check_call(cmd, shell=True) diff --git a/celescope/vdj/__init__.py b/celescope/vdj/__init__.py index 9038708c..fd57f4eb 100755 --- a/celescope/vdj/__init__.py +++ b/celescope/vdj/__init__.py @@ -3,4 +3,4 @@ __ASSAY__ = 'vdj' CHAINS = { "TCR": ["TRA", "TRB"], "BCR": ["IGH", "IGL", "IGK"], -} \ No newline at end of file +} diff --git a/celescope/vdj/mapping_vdj.py b/celescope/vdj/mapping_vdj.py index dd811695..45776487 100755 --- a/celescope/vdj/mapping_vdj.py +++ b/celescope/vdj/mapping_vdj.py @@ -63,7 +63,7 @@ class Mapping_vdj(Step): 'mixcr exportAlignments ' f'{self.read2_vdjca} {self.alignments} ' '-readIds --force-overwrite -vGene -dGene -jGene -cGene ' - '-nFeature CDR3 -aaFeature CDR3 ' + '-nFeature CDR3 -aaFeature CDR3 ' ) Mapping_vdj.run_mixcr.logger.info(cmd) @@ -71,7 +71,7 @@ class Mapping_vdj(Step): @utils.add_log def mixcr_summary(self, total_read, df_align): - + align_read = df_align.shape[0] self.add_metric( name=f"{self.read_type} Mapped to Any VDJ Gene", @@ -198,26 +198,25 @@ class Mapping_vdj(Step): @utils.add_log def mapping_vdj(args): - # TODO + # TODO # add TCR or BCR prefix to distinguish them in html report summary; should improve step_name = f"{args.type}_mapping_vdj" mapping_vdj_obj = Mapping_vdj(args, step_name) mapping_vdj_obj.run() - def get_opts_mapping_vdj(parser, sub_program): parser.add_argument("--type", help='TCR or BCR', required=True) parser.add_argument( - '--species', - choices=['hs', 'mmu'], - help='Default `hs`. `hs`(human) or `mmu`(mouse). ', + '--species', + choices=['hs', 'mmu'], + help='Default `hs`. `hs`(human) or `mmu`(mouse). ', default='hs' ) parser.add_argument("--not_consensus", action='store_true', help="Input fastq is not consensused.") if sub_program: parser.add_argument( - "--fq", + "--fq", help="Required. 
Input fastq file.", required=True, ) diff --git a/celescope/vdj/multi_vdj.py b/celescope/vdj/multi_vdj.py index 5d2a4415..d6257559 100755 --- a/celescope/vdj/multi_vdj.py +++ b/celescope/vdj/multi_vdj.py @@ -17,7 +17,6 @@ class Multi_vdj(Multi): ) self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) - def count_vdj(self, sample): # count_vdj step = 'count_vdj' @@ -33,11 +32,10 @@ class Multi_vdj(Multi): self.process_cmd(cmd, step, sample, m=8, x=self.args.thread) - def main(): multi = Multi_vdj(__ASSAY__) multi.run() + if __name__ == '__main__': main() - -- Gitee From 6acf5b48c704875b7e84b99a6593bf35e49dbe91 Mon Sep 17 00:00:00 2001 From: pigraul Date: Thu, 24 Jun 2021 09:22:30 +0800 Subject: [PATCH 74/96] add Dynaseq --- celescope/__init__.py | 1 + celescope/dynaseq/Generate_T_C_matrix.R | 51 +++ celescope/dynaseq/__init__.py | 23 ++ celescope/dynaseq/analysis.py | 104 ++++++ celescope/dynaseq/conversion.py | 267 ++++++++++++++ celescope/dynaseq/multi_dynaseq.py | 63 ++++ celescope/dynaseq/replace_tsne.py | 177 ++++++++++ celescope/dynaseq/replacement.py | 331 ++++++++++++++++++ celescope/dynaseq/star.py | 163 +++++++++ celescope/dynaseq/subsitution.py | 208 +++++++++++ celescope/templates/html/dynaseq/base.html | 156 +++++++++ .../html/dynaseq/replace_tsne_summary.html | 54 +++ .../html/dynaseq/replacement_summary.html | 3 + .../html/dynaseq/subsitution_summary.html | 13 + celescope/tools/multi.py | 10 +- celescope/tools/utils.py | 7 +- docs/dynaseq/analysis.md | 51 +++ docs/dynaseq/conversion.md | 26 ++ docs/dynaseq/replace_tsne.md | 30 ++ docs/dynaseq/replacement.md | 36 ++ docs/dynaseq/star.md | 56 +++ docs/dynaseq/subsitution.md | 20 ++ 22 files changed, 1844 insertions(+), 6 deletions(-) create mode 100755 celescope/dynaseq/Generate_T_C_matrix.R create mode 100755 celescope/dynaseq/__init__.py create mode 100755 celescope/dynaseq/analysis.py create mode 100755 celescope/dynaseq/conversion.py create mode 100755 celescope/dynaseq/multi_dynaseq.py create mode 100755 celescope/dynaseq/replace_tsne.py create mode 100755 celescope/dynaseq/replacement.py create mode 100755 celescope/dynaseq/star.py create mode 100755 celescope/dynaseq/subsitution.py create mode 100755 celescope/templates/html/dynaseq/base.html create mode 100644 celescope/templates/html/dynaseq/replace_tsne_summary.html create mode 100644 celescope/templates/html/dynaseq/replacement_summary.html create mode 100644 celescope/templates/html/dynaseq/subsitution_summary.html create mode 100644 docs/dynaseq/analysis.md create mode 100644 docs/dynaseq/conversion.md create mode 100644 docs/dynaseq/replace_tsne.md create mode 100644 docs/dynaseq/replacement.md create mode 100644 docs/dynaseq/star.md create mode 100644 docs/dynaseq/subsitution.md diff --git a/celescope/__init__.py b/celescope/__init__.py index 28a2454c..906341fc 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -16,6 +16,7 @@ ASSAY_DICT = { 'tag': 'Single-cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', + 'dynaseq': 'Single Cell Dynaseq' } ROOT_PATH = os.path.dirname(__file__) diff --git a/celescope/dynaseq/Generate_T_C_matrix.R b/celescope/dynaseq/Generate_T_C_matrix.R new file mode 100755 index 00000000..d97535f7 --- /dev/null +++ b/celescope/dynaseq/Generate_T_C_matrix.R @@ -0,0 +1,51 @@ +args <- commandArgs(T) + +require("reshape2") +require("tidyr") +require("dplyr") +require("Matrix") + +my.count1 <- read.table(args[1],h=F) + +my.count1$V1 <- as.character(my.count1$V1) +my.count1$gene <- 
gsub("--C","",my.count1$V1) +my.count1$gene <- gsub("--T","",my.count1$gene) +cells.keep <- my.count1 %>% dplyr::distinct(V2,V3,gene) %>% group_by(V2) %>% dplyr::summarize(count=n()) %>% arrange(desc(count)) %>% .$V2 %>% as.character + +inds <- as.numeric(args[2]) +if (length(cells.keep) > inds) { + cells.keep2 <- head(cells.keep,inds) +}else{ cells.keep2 <- cells.keep} + +my.count1 <- my.count1 %>% filter(V2 %in% cells.keep2) %>% droplevels +my.count1$type <- "C" +my.count1[grep("--T",my.count1$V1),]$type <- "T" +my.count2 <- dcast(my.count1,gene+V2+V3 ~ type, value.var = "V4") +my.count2[is.na(my.count2)] <- 0 + +if(! "C" %in% colnames(my.count2)) +{ + my.count2$C <- 0; +} +if (ncol(my.count2) !=5) { + stop("Error! Please verify the count data frame!\n"); +} +my.count2 <- my.count2 %>% arrange(gene,V2,V3,C,T) +my.count2 %>% mutate(type = ifelse(C > 0,"C","T")) -> my.count2 +my.count3 <- my.count2 %>% group_by(gene,type,V2) %>% dplyr::summarize(count=n()) +my.count3$gene2 <- paste(my.count3$gene,my.count3$type,sep="--") +my.count3$V2 <- as.factor(my.count3$V2) +my.count3$gene2 <- as.factor(my.count3$gene2) +data.sparse = sparseMatrix(as.integer(my.count3$gene2), as.integer(my.count3$V2), x = my.count3$count) +colnames(data.sparse) = levels(my.count3$V2) +rownames(data.sparse) = levels(my.count3$gene2) +ord <- sort(colSums(data.sparse),decreasing = T) +data.sparse <- data.sparse[,names(ord)] +saveRDS(data.sparse,file=args[3]) +outtsv<-paste(args[3],"tsv", sep = ".") +write.table(as.matrix(data.sparse), file = outtsv, sep = "\t", quote = F, row.names = T) + + + + +
diff --git a/celescope/dynaseq/__init__.py b/celescope/dynaseq/__init__.py new file mode 100755 index 00000000..6ed6df1f --- /dev/null +++ b/celescope/dynaseq/__init__.py @@ -0,0 +1,23 @@ +__STEPS__ = [ + 'sample', + 'barcode', + 'cutadapt', + 'star', + "featureCounts", + "count", + 'analysis', + 'conversion', + 'subsitution', + 'replacement', + 'replace_tsne'] + +__ASSAY__ = 'dynaseq' + +# m: memory +# x: thread +RESOURCE = { + 'sample': {'m':1, 'x':1}, + 'barcode': {'m':5, 'x':1}, + 'cutadapt': {'m':5, 'x':1}, + 'star': {'m':30, 'x':1}, +}
diff --git a/celescope/dynaseq/analysis.py b/celescope/dynaseq/analysis.py new file mode 100755 index 00000000..ad5ec8f8 --- /dev/null +++ b/celescope/dynaseq/analysis.py @@ -0,0 +1,104 @@ +import pandas as pd + +from celescope.tools.analysis_mixin import AnalysisMixin +from celescope.tools.step import Step +from celescope.tools.utils import add_log, get_id_name_dict, s_common + + +@add_log +def generate_matrix(gtf_file, matrix_file): + + id_name = get_id_name_dict(gtf_file) + matrix = pd.read_csv(matrix_file, sep="\t") + + gene_name_col = matrix.geneID.apply(lambda x: id_name[x]) + matrix.geneID = gene_name_col + matrix = matrix.drop_duplicates(subset=["geneID"], keep="first") + matrix = matrix.dropna() + matrix = matrix.rename({"geneID": ""}, axis='columns') + return matrix + + +class Analysis_dynaseq(Step, AnalysisMixin): + """ + Features + - Cell clustering with Seurat. + + - Calculate the marker gene of each cluster. + + - Cell type annotation (optional). You can provide markers of known cell types and annotate cell types for each cluster. + + Output + - `markers.tsv` Marker genes of each cluster. + + - `tsne_coord.tsv` t-SNE coordinates and clustering information. + + - `{sample}/06.analysis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` + parameter is provided.
The result contains 3 files: + - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", + it means that the given marker is not enough to identify the cluster. + - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. + - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) + """ + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + AnalysisMixin.__init__(self, args) + self.matrix_file = args.matrix_file + self.genomeDir = args.genomeDir + self.type_marker_tsv = args.type_marker_tsv + self.auto_assign_bool = False + self.save_rds = args.save_rds + if args.type_marker_tsv and args.type_marker_tsv != 'None': + self.auto_assign_bool = True + self.save_rds = True + + def run(self): + self.seurat(self.matrix_file, self.save_rds, self.genomeDir) + if self.auto_assign_bool: + self.auto_assign(self.type_marker_tsv) + self.run_analysis() + self.add_data_item(cluster_tsne=self.cluster_tsne) + self.add_data_item(gene_tsne=self.gene_tsne) + self.add_data_item(table_dict=self.table_dict) + + self.clean_up() + + +@add_log +def analysis(args): + + step_name = "analysis" + ana = Analysis_dynaseq(args, step_name) + ana.run() + + +def get_opts_analysis(parser, sub_program): + + parser.add_argument('--genomeDir', help='Required. Genome directory.', required=True) + parser.add_argument('--save_rds', action='store_true', help='Write rds to disk.') + parser.add_argument( + '--type_marker_tsv', + help="""A tsv file with header. If this parameter is provided, cell type will be annotated. Example: +``` +cell_type marker +Alveolar "CLDN18,FOLR1,AQP4,PEBP4" +Endothelial "CLDN5,FLT1,CDH5,RAMP2" +Epithelial "CAPS,TMEM190,PIFO,SNTN" +Fibroblast "COL1A1,DCN,COL1A2,C1R" +B_cell "CD79A,IGKC,IGLC3,IGHG3" +Myeloid "LYZ,MARCO,FCGR3A" +T_cell "CD3D,TRBC1,TRBC2,TRAC" +LUAD "NKX2-1,NAPSA,EPCAM" +LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" +```""" + ) + if sub_program: + parser.add_argument( + '--matrix_file', + help='Required. Matrix_10X directory from step count.', + required=True, + ) + parser = s_common(parser) + + + diff --git a/celescope/dynaseq/conversion.py b/celescope/dynaseq/conversion.py new file mode 100755 index 00000000..fc300d66 --- /dev/null +++ b/celescope/dynaseq/conversion.py @@ -0,0 +1,267 @@ +#!/usr/bin/env python +# v1.0 + +import pysam +import os +import subprocess +import numpy as np +import pandas as pd +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils + + +class Conversion(Step): + """ + Features + - Get conversion pos in each read. + - Get snp info. + + Output + - `{sample}.PosTag.bam` Bam file with conversion info. + - `{sample}.PosTag.csv` SNP info in csv format. 
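The per-read tags written to `{sample}.PosTag.bam` (`SC` specific conversions, `TC` reference base content, `TL`/`AL` conversion positions, `ST` gene strand) can be read back with pysam. A minimal sketch, assuming an indexed output BAM; the file name is a placeholder:
```
import pysam

with pysam.AlignmentFile("sample.PosTag.bam", "rb") as bam:  # placeholder path
    for read in bam.fetch():
        try:
            strand = read.get_tag("ST")  # gene strand from the strandedness table
            # TL holds reference positions of t->C conversions, AL of a->G
            locs = read.get_tag("TL") if strand == "+" else read.get_tag("AL")
        except KeyError:
            continue  # defensive: skip reads written without tags
        if locs[0] != 0:  # [0] is this module's sentinel for no conversions
            print(read.query_name, strand, list(locs))
```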
+ """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + # input files + self.ifile = os.path.join(args.outdir, args.sample+'.bam') + self.sample = args.sample + self.strandednessfile = args.strand + self.inbam = args.bam + self.bcfile = args.cell + self.outdir = args.outdir + self.thread = args.thread + + # output files + self.outfile_bam = os.path.join(args.outdir, args.sample+'.PosTag.bam') + self.outfile_csv = os.path.join(args.outdir, args.sample+'.PosTag.csv') + + @utils.add_log + def run(self): + ##Filter and sort + self.fltSort(self.inbam,self.ifile,self.bcfile,self.thread) + cmd=['samtools index',self.ifile] + self.run_cmd(cmd) + + ##Adding tags + self.addTags(self.ifile,self.outfile_bam,self.strandednessfile) + cmd=['samtools index',self.outfile_bam] + self.run_cmd(cmd) + + #Obtaining conversion positions + bam = pysam.AlignmentFile(self.outfile_bam, 'rb') + ContigLocs, AnnoteLocs=self.CountConvperPos(bam) + + #Obtaining coverage over conversion position + ConvsPerPos,CoverofPosWithConvs = self.CountReadConverPerConvPos(bam,ContigLocs) + A=self.ExportasVcf(ConvsPerPos,CoverofPosWithConvs,AnnoteLocs) + A['sample'] = self.sample + #Saving result + A.to_csv(self.outfile_csv) + bam.close() + + cmd=['rm', self.ifile] + self.run_cmd(cmd) + cmd=['rm', self.ifile+'.bai'] + self.run_cmd(cmd) + + def run_cmd(self,cmd): + subprocess.call(' '.join(cmd),shell=True) + + @utils.add_log + def CountConvperPos(self,bamfile): + ContigLocs={} + AnnoteLocs={} + for read in bamfile.fetch(): + try: + if read.get_tag('ST')=='+': + locs=read.get_tag('TL') + else: + locs=read.get_tag('AL') + if locs[0]!=0: + if read.reference_name in ContigLocs: + ContigLocs[read.reference_name].extend(locs) + else: + ContigLocs[read.reference_name] = list(locs) + if read.reference_name not in AnnoteLocs: + for i,each in enumerate(locs): + if i == 0: + AnnoteLocs[read.reference_name] = { each :read.get_tag('XT')} + else: + AnnoteLocs[read.reference_name][each] = read.get_tag('XT') + else: + for i,each in enumerate(locs): + if each not in AnnoteLocs[read.reference_name]: + AnnoteLocs[read.reference_name][each] = read.get_tag('XT') + except (ValueError,KeyError): + continue + return ContigLocs, AnnoteLocs + + @utils.add_log + def CountReadConverPerConvPos(self,bam,ContigLocs): + ConvsPerPos={} + CoverofPosWithConvs={} + for key in ContigLocs.keys(): + ContigLocs[key]=sorted(ContigLocs[key]) + ConvsPerPos[key]={} + k=0 + current=ContigLocs[key][k] + k+=1 + nextone=ContigLocs[key][k] + while k < len(ContigLocs[key])-1: + ConvsPerPos[key][current]=1 + while current == nextone and k < len(ContigLocs[key])-1: + k+=1 + nextone=ContigLocs[key][k] + ConvsPerPos[key][current]+=1 + current = nextone + if k < len(ContigLocs[key])-1: + k+=1 + nextone=ContigLocs[key][k] + + CoverofPosWithConvs[key]={} + for key2 in ConvsPerPos[key].keys(): + try: + CoverofPosWithConvs[key][key2]=bam.count(key,key2,key2+1) + except ValueError: + continue + return ConvsPerPos,CoverofPosWithConvs + + @utils.add_log + def ExportasVcf(self,ConvsPerPos,CoverofPosWithConvs, AnnoteLocs): + #Chrom, Pos , ConvsPerPs, CoverofPosWithConvs + Outputdf =pd.DataFrame(columns=['pos2','convs','covers','chrom','posratio']) + for key in ConvsPerPos.keys(): + df=pd.DataFrame.from_dict(ConvsPerPos[key], orient='index') + df1=pd.DataFrame.from_dict(CoverofPosWithConvs[key], orient='index') + df.index.name='pos' + df1.index.name='pos' + df.columns = ['convs'] + df1.columns = ['covers'] + df2=df.join(df1) + df2['pos2'] = df2.index + df2.index = 
np.arange(df2.shape[0]) + df2['chrom']=np.repeat(key,df2.shape[0]) + df2['posratio']=df2['convs']/df2['covers'] + df3=pd.DataFrame.from_dict(AnnoteLocs[key], orient='index') + df3.columns = ['gene_id'] + df2=df2.join(df3, on='pos2') + Outputdf=Outputdf.append(df2) + return Outputdf.reset_index(drop=True) + + def createTag(self,d): + return ''.join([''.join(key) + str(d[key]) + ';' for key in d.keys()])[:-1] + + + def convInRead(self, read, qual = 20): + specific_conversions = {} + total_content = {'a' : 0, 'c' : 0, 'g' : 0, 't' : 0} + specific_conversions[('c', 'A')] = 0 + specific_conversions[('g', 'A')] = 0 + specific_conversions[('t', 'A')] = 0 + specific_conversions[('a', 'C')] = 0 + specific_conversions[('g', 'C')] = 0 + specific_conversions[('t', 'C')] = 0 + specific_conversions[('a', 'G')] = 0 + specific_conversions[('c', 'G')] = 0 + specific_conversions[('t', 'G')] = 0 + specific_conversions[('a', 'T')] = 0 + specific_conversions[('c', 'T')] = 0 + specific_conversions[('g', 'T')] = 0 + specific_conversions[('a', 'N')] = 0 + specific_conversions[('c', 'N')] = 0 + specific_conversions[('g', 'N')] = 0 + specific_conversions[('t', 'N')] = 0 + + tC_loc = [] + aG_loc = [] + + try: + refseq = read.get_reference_sequence().lower() + except (UnicodeDecodeError): + refseq='' + + for base in total_content.keys(): + total_content[base] += refseq.count(base) + for pair in read.get_aligned_pairs(with_seq=True): + try: + if pair[0] is not None and pair[1] is not None and pair[2] is not None: + if str(pair[2]).islower() and not read.query_qualities[pair[0]] < qual: + specific_conversions[(pair[2],read.seq[pair[0]])] += 1 + if (pair[2],read.seq[pair[0]]) == ('t', 'C'): + tC_loc.append(pair[1]) + if (pair[2],read.seq[pair[0]]) == ('a', 'G'): + aG_loc.append(pair[1]) + except (UnicodeDecodeError, KeyError): + continue + SC_tag = self.createTag(specific_conversions) + TC_tag = self.createTag(total_content) + + if len(tC_loc) == 0: + tC_loc.append(0) + if len(aG_loc) == 0: + aG_loc.append(0) + return SC_tag, TC_tag, tC_loc, aG_loc + + @utils.add_log + def addTags(self,bamfilename, outputname,strandednessfile): + bamfile = pysam.AlignmentFile(bamfilename, 'rb') + mod_bamfile = pysam.AlignmentFile(outputname, mode='wb',template=bamfile) + strandedness = pd.read_csv(strandednessfile, header=None, index_col=0) + for read in bamfile.fetch(): + try: + tags = self.convInRead(read) + read.set_tag('SC',tags[0],'Z') + read.set_tag('TC',tags[1],'Z') + read.set_tag('TL',tags[2]) + read.set_tag('AL',tags[3]) + read.set_tag('ST',strandedness.loc[read.get_tag('XT')][1]) + mod_bamfile.write(read) + except (ValueError,KeyError): + continue + + bamfile.close() + mod_bamfile.close() + + @utils.add_log + def fltSort(self,bamfilename, outfile_bam,cellfile, thread=8): + bamfile = pysam.AlignmentFile(bamfilename, 'rb') + mod_bamfile = pysam.AlignmentFile(outfile_bam, mode='wb',template=bamfile) + cells={} + with open(cellfile) as f: + for i in f: + cells[i.strip()] = 1 + for read in bamfile.fetch(until_eof=True): + try: + if not read.has_tag('GX'): continue + if read.get_tag("CB") not in cells: continue + mod_bamfile.write(read) + except (ValueError,KeyError): + continue + bamfile.close() + mod_bamfile.close() + + cmd=['samtools sort -@',str(thread), '-o', outfile_bam+'.bam',outfile_bam] + self.run_cmd(cmd) + cmd=['mv',outfile_bam+'.bam',outfile_bam] + self.run_cmd(cmd) + + + + + +@utils.add_log +def conversion(args): + + step_name = "conversion" + conversion_obj = Conversion(args, step_name) + conversion_obj.run() + +def 
get_opts_conversion(parser, sub_program):
+    parser.add_argument('--strand', help='Gene strand file, two columns with no header: gene_id and strand.', required=True)
+    if sub_program:
+        parser.add_argument("--bam", help='featureCounts output BAM file.', required=True)
+        parser.add_argument("--cell", help='Cell barcode list file, one barcode per line.', required=True)
+        parser = s_common(parser)
+    return parser
+
diff --git a/celescope/dynaseq/multi_dynaseq.py b/celescope/dynaseq/multi_dynaseq.py
new file mode 100755
index 00000000..ed6795f5
--- /dev/null
+++ b/celescope/dynaseq/multi_dynaseq.py
@@ -0,0 +1,63 @@
+from celescope.dynaseq.__init__ import __ASSAY__
+from celescope.tools.multi import Multi
+
+
+class Multi_dynaseq(Multi):
+
+    def conversion(self, sample):
+        step = 'conversion'
+        bam = f'{self.outdir_dic[sample]["featureCounts"]}/{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam'
+        cell = f'{self.outdir_dic[sample]["count"]}/{sample}_matrix_10X/barcodes.tsv'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--bam {bam} '
+            f'--cell {cell} '
+        )
+        self.process_cmd(cmd, step, sample, m=5, x=1)
+
+    def subsitution(self, sample):
+        step = 'subsitution'
+        bam = f'{self.outdir_dic[sample]["conversion"]}/{sample}.PosTag.bam'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--bam {bam} '
+        )
+        self.process_cmd(cmd, step, sample, m=1, x=1)
+
+
+    def replacement(self, sample):
+        step = 'replacement'
+        bam = f'{self.outdir_dic[sample]["conversion"]}/{sample}.PosTag.bam'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--bam {bam} '
+            f'--bg {self.col5_dict[sample]} '
+        )
+        self.process_cmd(cmd, step, sample, m=10, x=1)
+
+
+    def replace_tsne(self, sample):
+        step = 'replace_tsne'
+        tsne_file = f'{self.outdir_dic[sample]["analysis"]}/{sample}_tsne_coord.tsv'
+        mat_file = f'{self.outdir_dic[sample]["replacement"]}/{sample}.fraction_of_newRNA_matrix.txt'
+        rep_file = f'{self.outdir_dic[sample]["replacement"]}/{sample}.fraction_of_newRNA_per_cell.txt'
+        cmd_line = self.get_cmd_line(step, sample)
+        cmd = (
+            f'{cmd_line} '
+            f'--tsne {tsne_file} '
+            f'--mat {mat_file} '
+            f'--rep {rep_file} '
+        )
+        self.process_cmd(cmd, step, sample, m=1, x=1)
+
+
+def main():
+    multi = Multi_dynaseq(__ASSAY__)
+    multi.run()
+
+if __name__ == '__main__':
+    main()
+
diff --git a/celescope/dynaseq/replace_tsne.py b/celescope/dynaseq/replace_tsne.py
new file mode 100755
index 00000000..a0b84eb1
--- /dev/null
+++ b/celescope/dynaseq/replace_tsne.py
@@ -0,0 +1,177 @@
+#!/usr/bin/env python
+# coding=utf8
+
+import os
+import pandas as pd
+import plotly
+import plotly.graph_objects as go
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+
+
+class Replace_tsne(Step):
+    """
+    Features
+    - Plot the turn-over (replacement) rate of each cell on t-SNE coordinates.
+    - Report the top replaced genes in each cluster.
+
+    Output
+    - `{sample}.rep_in_tsne.txt` Turn-over rate of each cell, with t-SNE coordinates and cluster.
+    - `{sample}.rep_in_tsne_top10.txt` Top 10 replaced genes in each cluster.
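+
+    A minimal sketch of consuming the per-cell output downstream (path is
+    illustrative; columns follow the header written by `dot_tsne` below):
+    ```
+    import pandas as pd
+
+    # Columns: Cell, tSNE_1, tSNE_2, Cluster, ratio (0 for cells without a measured rate)
+    df = pd.read_table("sample.rep_in_tsne.txt")
+    print(df.groupby("Cluster")["ratio"].mean().sort_values(ascending=False))
+    ```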
+    """
+
+    def __init__(self, args, step_name):
+        Step.__init__(self, args, step_name)
+
+        # input files
+        self.sample = args.sample
+        self.tsnefile = args.tsne
+        self.matfile = args.mat
+        self.repfile = args.rep
+        self.mincell = args.mincell
+        self.topgene = args.topgene
+        # output files
+        self.outdot = os.path.join(self.outdir, self.sample+'.rep_in_tsne.txt')
+        self.outtbl = os.path.join(self.outdir, self.sample+'.rep_in_tsne_top10.txt')

+    @utils.add_log
+    def run(self):
+        # turn-over rate of cells in each cluster
+        self.dot_tsne(self.repfile, self.tsnefile, self.outdot)
+        div_item = self.tsne_plot(self.outdot)
+        # high turn-over genes in each cluster
+        self.top_gene_cluster(self.matfile, self.tsnefile, self.outtbl, self.mincell, self.topgene)
+        tbltxt = pd.read_csv(self.outtbl, header=0, sep="\t")
+        tbldiv = self.tsne_table(tbltxt)
+
+        # report
+        self.report_prepare(div_item, tbldiv)
+        self.clean_up()
+
+
+
+    @utils.add_log
+    def dot_tsne(self, repfile, tsnefile, outfile):
+        cells = {}
+        with open(repfile, 'r') as f:
+            for i in f:
+                ii = i.strip().split()
+                cells[ii[0]] = ii[1]
+
+        outf = open(outfile, 'w')
+        outf.write("Cell\ttSNE_1\ttSNE_2\tCluster\tratio\n")
+        with open(tsnefile, 'r') as f:
+            f.readline()
+            for i in f:
+                ii = i.strip().split()
+                if ii[0] in cells:
+                    outl = '\t'.join(ii[0:4])+'\t'+cells[ii[0]]+'\n'
+                else:
+                    outl = '\t'.join(ii[0:4])+'\t0'+'\n'
+                outf.write(outl)
+        outf.close()
+
+    @utils.add_log
+    def tsne_plot(self, txt):
+        df = pd.read_table(txt)
+        # assign the sorted result so that high-rate cells are drawn last (on top)
+        df = df.sort_values(by="ratio")
+        newtitle = "t-SNE plot Colored by RNA Turn-over rate"
+
+        fig = go.Figure()
+        fig.add_trace(go.Scatter(x=df['tSNE_1'], y=df['tSNE_2'], mode='markers',
+                                 marker_opacity=0.9, marker_size=4, marker_color=df['ratio'],
+                                 marker_colorscale="PuBu", marker_showscale=True,
+                                 ))
+        fig.update_layout(height=600, width=600, title_text=newtitle)
+        fig.update_layout(plot_bgcolor = '#FFFFFF')
+        fig.update_xaxes(showgrid=False, linecolor='black', showline=True, ticks='outside', title_text='t-SNE1')
+        fig.update_yaxes(showgrid=False, linecolor='black', showline=True, ticks='outside', title_text='t-SNE2')
+
+        div = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div')
+
+        return div
+
+
+    def tsne_table(self, txt):
+        marker_gene_table = txt.to_html(
+            escape=False,
+            index=False,
+            table_id='replacement_table_cluster',
+            justify="center")
+
+        return marker_gene_table
+
+
+    def file_stat(self, infile, clu):
+        clus = list(set(clu.values()))
+        cluster = {}
+        for c in clus:
+            cluster[c] = {}
+        fn = open(infile, "r")
+        fnh = fn.readline().strip().split()
+        for i in fn:
+            ii = i.strip().split()
+            for j in range(1, len(ii)):
+                if ii[j] == 'NA': continue
+                if fnh[j] not in clu: continue
+                if ii[0] not in cluster[clu[fnh[j]]]:
+                    cluster[clu[fnh[j]]][ii[0]] = []
+                cluster[clu[fnh[j]]][ii[0]].append(float(ii[j]))
+        fn.close()
+        return cluster
+
+    def tsne_file(self, infile):
+        clu = {}
+        with open(infile) as f:
+            f.readline()
+            for i in f:
+                ii = i.strip().split()
+                clu[ii[0]] = ii[3]
+        return clu
+
+
+    @utils.add_log
+    def top_gene_cluster(self, matrix, tsnefile, outfile, mincell=5, topgene=10):
+        tsne = self.tsne_file(tsnefile)
+        cluster = self.file_stat(matrix, tsne)
+
+        w = open(outfile, 'w')
+        w.write("cluster\tgene\tTurn-over_rate\tcells\n")
+        for c in cluster:
+            tmp = {}
+            for g in cluster[c]:
+                gt = sum(cluster[c][g]) / len(cluster[c][g])
+                tmp[g] = gt
+            sorttmp = sorted(tmp.items(), key=lambda item:item[1], reverse=True)
+            tmpn = 0
+            for x in sorttmp:
+                if len(cluster[c][x[0]]) < mincell: continue
+                tmpn += 1
+                if tmpn > topgene: break
+                # one row per retained gene: cluster, gene, mean turn-over rate, cell count
+                
w.write('cluster'+c+'\t'+x[0]+'\t'+str(x[1])+'\t'+str(len(cluster[c][x[0]]))+'\n')
+        w.close()
+
+
+    def report_prepare(self, outdiv, outable):
+        self.add_data_item(replace_tsne=outdiv)
+        self.add_data_item(replace_tsne_table=outable)
+
+
+@utils.add_log
+def replace_tsne(args):
+
+    step_name = "replace_tsne"
+    replace_tsne_obj = Replace_tsne(args, step_name)
+    replace_tsne_obj.run()
+
+def get_opts_replace_tsne(parser, sub_program):
+    if sub_program:
+        parser.add_argument('--tsne', help='t-SNE coordinates file (tsne_coord.tsv from the analysis step).', required=True)
+        parser.add_argument('--mat', help='Fraction-of-new-RNA matrix file from the replacement step.', required=True)
+        parser.add_argument('--rep', help='Fraction-of-new-RNA per cell file from the replacement step.', required=True)
+        parser.add_argument('--mincell', type=int, default=5, help='Only report genes with turn-over detected in at least this many cells. Default 5.')
+        parser.add_argument('--topgene', type=int, default=10, help='Number of top genes to report per cluster. Default 10.')
+        parser = s_common(parser)
+    return parser
+
diff --git a/celescope/dynaseq/replacement.py b/celescope/dynaseq/replacement.py
new file mode 100755
index 00000000..c8ac2a15
--- /dev/null
+++ b/celescope/dynaseq/replacement.py
@@ -0,0 +1,331 @@
+#!/usr/bin/env python
+# coding=utf8
+
+import os
+import sys
+import subprocess
+import pandas as pd
+import pysam
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+toolsdir = os.path.dirname(__file__)
+
+
+class Replacement(Step):
+    """
+    Features
+    - Computes the replacement rates in each cell and gene.
+    - Boxplots of the rate distributions.
+
+    Output
+    - `{sample}.TC_matrix.rds` New and old RNA info for each barcode/gene/UMI.
+    - `{sample}.new_matrix.tsv.gz` New RNA matrix.
+    - `{sample}.old_matrix.tsv.gz` Old RNA matrix.
+    - `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell.
+    - `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene.
+    - `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene.
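+
+    A minimal sketch of recomputing the per-cell fraction from the two gzipped
+    matrices, assuming genes as rows and cell barcodes as columns:
+    ```
+    import pandas as pd
+
+    new = pd.read_csv("sample.new_matrix.tsv.gz", sep="\t", index_col=0)
+    old = pd.read_csv("sample.old_matrix.tsv.gz", sep="\t", index_col=0)
+    # fraction of new RNA per cell = new UMIs / (new + old) UMIs
+    fraction = new.sum() / (new.sum() + old.sum())
+    print(fraction.sort_values(ascending=False).head())
+    ```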
+ """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + # input files + self.outdir = args.outdir + self.sample = args.sample + self.bam_file = args.bam + self.snp_file = args.bg + self.bg_cov = args.bg_cov + self.cell_keep = args.cell_keep + # output files + self.outread = os.path.join(self.outdir, self.sample+'.corrected_gene_cell_UMI_read.txt') + self.outrds = os.path.join(self.outdir, self.sample+'.TC_matrix.rds') + self.outpre = os.path.join(self.outdir,self.sample) + + @utils.add_log + def run(self): + # get backgroud snp + bg = self.background_snp(self.snp_file,self.bg_cov) + # get reads with TC + self.extract_dem(self.bam_file,self.outread,bg) + # run_R + self.generate_TC_matrix(self.outread, self.outrds, self.cell_keep) + + # split to New and Old Matrix + totMat = self.outrds+'.tsv' + new_mat = self.outpre+'.new_matrix.tsv' + old_mat = self.outpre+'.old_matrix.tsv' + con_mat = self.outpre+'.NvsO_matrix.tsv' + self.split_matrix(totMat,self.outpre) + + # replacement stat + self.replacment_stat(con_mat,self.outpre) + # plot + div_item = self.replacment_plot(self.outpre) + + # report + self.report_prepare(div_item) + self.clean_up() + + # clean + cmd=['rm', self.outread] + self.run_cmd(cmd) + cmd=['rm', self.outrds+'.tsv'] + self.run_cmd(cmd) + cmd=['rm', con_mat] + self.run_cmd(cmd) + cmd=['gzip', new_mat] + self.run_cmd(cmd) + cmd=['gzip', old_mat] + self.run_cmd(cmd) + + def run_cmd(self,cmd): + subprocess.call(' '.join(cmd),shell=True) + + @utils.add_log + def extract_dem(self,bam,outfile,bg): + bamfile = pysam.AlignmentFile(bam, 'rb') + countdict = {} + for read in bamfile.fetch(): + try: + chro = read.reference_name + cb = read.get_tag('CB') + ub = read.get_tag('UB') + if not read.has_tag('GN'): continue + gene = read.get_tag('GN') + + if read.get_tag('ST') == '+': + stag = read.get_tag('TL') + else: + stag = read.get_tag('AL') + if len(stag)==1 and stag[0]==0: + gene += '--T' + else: + fcount = 0 + for si in range(0,len(stag)): + pos = chro + '_' + str(stag[si]) + if pos in bg: + fcount += 1 + if fcount == len(stag): + gene += '--T' + else: + gene += '--C' + + readinfo = '\t'.join([gene,cb,ub]) + if readinfo not in countdict: + countdict[readinfo] = 1 + else: + countdict[readinfo] += 1 + + except (ValueError,KeyError): + continue + bamfile.close() + + out1 = open(outfile,'w') + for rid in countdict: + out1.write(rid+'\t'+str(countdict[rid])+'\n') + out1.close() + + @utils.add_log + def background_snp(self,bgfile,cov=1): + outdict = {} + if bgfile.endswith('.csv'): + with open(bgfile) as f: + f.readline() + for i in f: + ii = i.strip().split(',') + if int(ii[2]) {self.ribo_run_log} 2>&1 ' + ) + Star_dynaseq.ribo.logger.info(cmd) + subprocess.check_call(cmd, shell=True) + + @utils.add_log + def picard(self): + cmd = [ + 'picard', + '-Xmx20G', + '-XX:ParallelGCThreads=4', + 'CollectRnaSeqMetrics', + 'I=%s' % (self.STAR_bam), + 'O=%s' % (self.picard_region_log), + 'REF_FLAT=%s' % (self.refflat), + 'STRAND=NONE', + 'VALIDATION_STRINGENCY=SILENT'] + cmd_str = ' '.join(cmd) + Star_dynaseq.picard.logger.info(cmd_str) + subprocess.check_call(cmd) + + @utils.add_log + def run(self): + self.run_star() + self.picard() + if self.debug: + self.ribo() + self.add_other_metrics() + self.clean_up() + + +def star(args): + step_name = "star" + runner = Star_dynaseq(args, step_name) + runner.run() + + +def get_opts_star(parser, sub_program): + get_opts_star_mixin(parser, sub_program) diff --git a/celescope/dynaseq/subsitution.py b/celescope/dynaseq/subsitution.py 
new file mode 100755
index 00000000..44f251be
--- /dev/null
+++ b/celescope/dynaseq/subsitution.py
@@ -0,0 +1,208 @@
+#!/usr/bin/env python
+# coding=utf8
+
+import os
+import pysam
+import re
+import pandas as pd
+import plotly
+import plotly.graph_objects as go
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+
+class Subsitution(Step):
+    """
+    Features
+    - Computes the overall conversion rates in reads and plots a barplot.
+
+    Output
+    - `{sample}.substitution.txt` Tab-separated table of the overall conversion rates.
+    """
+
+    def __init__(self, args, step_name):
+        Step.__init__(self, args, step_name)
+
+        # input files
+        self.sample = args.sample
+        self.bam_file = args.bam
+        self.outdir = args.outdir
+
+        # output files
+        self.outstat = os.path.join(self.outdir, self.sample+'.substitution.txt')
+
+
+    @utils.add_log
+    def run(self):
+        # overall rate
+        for_base, rev_base, is_forward, is_reverse = self.get_sub_tag(self.bam_file)
+        self.sub_stat(for_base, rev_base, is_forward, is_reverse, self.outstat)
+        div_item = self.sub_plot(self.outstat)
+
+        self.report_prepare(div_item)
+        self.clean_up()
+
+
+    @utils.add_log
+    def get_sub_tag(self, bam):
+        bamfile = pysam.AlignmentFile(bam, 'rb')
+        is_reverse = {'cA':0, 'gA':0, 'tA':0, 'aC':0, 'gC':0, 'tC':0, 'aG':0, 'cG':0, 'tG':0, 'aT':0, 'cT':0, 'gT':0}
+        is_forward = {'cA':0, 'gA':0, 'tA':0, 'aC':0, 'gC':0, 'tC':0, 'aG':0, 'cG':0, 'tG':0, 'aT':0, 'cT':0, 'gT':0}
+        for_base = {'a':0, 'c':0, 'g':0, 't':0}
+        rev_base = {'a':0, 'c':0, 'g':0, 't':0}
+        snp_tags = ['', 'cA', 'gA', 'tA', 'aC', 'gC', 'tC', 'aG', 'cG', 'tG', 'aT', 'cT', 'gT']
+        ref_tags = ['', 'a', 'c', 'g', 't']
+        for read in bamfile.fetch():
+            try:
+                snpmatch = re.match( r'cA(\d+);gA(\d+);tA(\d+);aC(\d+);gC(\d+);tC(\d+);aG(\d+);cG(\d+);tG(\d+);aT(\d+);cT(\d+);gT(\d+);', read.get_tag('SC'), re.M)
+                totmatch = re.match( r'a(\d+);c(\d+);g(\d+);t(\d+)', read.get_tag('TC'), re.M)
+                if snpmatch and totmatch:
+                    if read.is_reverse:
+                        for j in range(1, len(ref_tags)):
+                            rev_base[ref_tags[j]] += int(totmatch.group(j))
+                        for i in range(1, len(snp_tags)):
+                            is_reverse[snp_tags[i]] += int(snpmatch.group(i))
+                    else:
+                        for j in range(1, len(ref_tags)):
+                            for_base[ref_tags[j]] += int(totmatch.group(j))
+                        for i in range(1, len(snp_tags)):
+                            is_forward[snp_tags[i]] += int(snpmatch.group(i))
+            except (ValueError, KeyError):
+                continue
+        bamfile.close()
+
+        return for_base, rev_base, is_forward, is_reverse
+
+    @utils.add_log
+    def sub_stat(self, for_base, rev_base, is_forward, is_reverse, outfile):
+        convertdict = {'a':['aC','aG','aT'],
+                       'c':['cA','cG','cT'],
+                       'g':['gA','gC','gT'],
+                       't':['tA','tC','tG']}
+        subdict = {'a':'t','t':'a','c':'g','g':'c',
+                   'aC':'tG','aG':'tC','aT':'tA',
+                   'cA':'gT','cG':'gC','cT':'gA',
+                   'gA':'cT','gC':'cG','gT':'cA',
+                   'tA':'aT','tC':'aG','tG':'aC'}
+        outdict = {'aC':'A_to_C','aG':'A_to_G','aT':'A_to_T',
+                   'cA':'C_to_A','cG':'C_to_G','cT':'C_to_T',
+                   'gA':'G_to_A','gC':'G_to_C','gT':'G_to_T',
+                   'tA':'T_to_A','tC':'T_to_C','tG':'T_to_G'}
+        outw = open(outfile, 'w')
+        for x in ['a','c','g','t']:
+            fbase = for_base[x]
+            rbase = rev_base[subdict[x]]
+            for y in convertdict[x]:
+                fcov = is_forward[y]*100 / float(fbase)
+                rcov = is_reverse[subdict[y]]*100 / float(rbase)
+                outw.write(outdict[y]+'\t'+"%.3f"%fcov+'\t'+"%.3f"%rcov+'\n')
+        outw.close()

+    @utils.add_log
+    def sub_plot(self, txt):
+        df = pd.read_table(txt, header=None)
+        df.columns = ['sample', '+', '-']
+
+        fig = go.Figure()
+        ## set up the color palette: cycle the first 9 Plotly qualitative colors into a pool of 100
+        import plotly.express as px  # local import, used only for the qualitative palette
+        num4colors = 0
+        num4rainbow = 0
+        
colors_list = [] + while num4colors<100: + if num4rainbow == 9: + num4rainbow = 0 + colors_list.append(px.colors.qualitative.Plotly[num4rainbow]) + num4colors+=1 + num4rainbow+=1 + + num4sample = 0 + colors4sample = {} + num4x = 0 + + for sample in df['sample'].unique(): + legend_show = True + colors4sample[sample] = colors_list[num4sample] + num4sample += 1 + flag_x = 'x' + str(num4x+1) + df_plot = df[ df['sample'] == sample ] + num4x+=1 + + fig.add_trace(go.Bar(name=sample+'+', + x=df_plot['sample'], + y=df_plot['+'], + legendgroup=sample, + marker_color=colors4sample[sample], + marker_line_color='#FFFFFF', + showlegend=legend_show, + xaxis=flag_x) + ) + fig.add_trace(go.Bar(name=sample+'-', + x=df_plot['sample'], + y=df_plot['-'], + legendgroup=sample, + showlegend=legend_show, + marker_color=colors4sample[sample], + marker_line_color='#FFFFFF', + opacity=0.3, + xaxis=flag_x) + ) + + fig.update_layout(barmode='stack') + + per = 1/(num4x+1) + gap4bar = per/len(df['sample'].unique()) + num4x = 0 + for typeB in df['sample'].unique(): + if num4x == 0: + flag_x = 'xaxis' + else: + flag_x = 'xaxis' + str(num4x+1) + anchor_x = 'x'+str(num4x+1) + num4x += 1 + fig['layout'][flag_x] = dict(domain=[per*num4x, per*(num4x+1)-gap4bar], anchor=anchor_x, title=typeB) + + fig.update_layout(plot_bgcolor = '#FFFFFF') + fig.update_xaxes(showgrid=False, linecolor='black', showline=True, ticks='outside', showticklabels=False) + fig.update_yaxes(showgrid=False, linecolor='black', showline=True, ticks='outside') + width_num = 400 * ( len(df['sample'].unique())* len(df['sample'].unique()) ) / (5*12) ## 控制柱形图的宽度 + fig.update_layout(height=500, width=width_num) + fig.update_layout(legend=dict(orientation="h")) + fig.update_layout(legend=dict( + yanchor="top", + y=1.3, + xanchor="left", + x=0.05, + valign="top", + )) + + fig.update_layout( + yaxis_title="Rates of nucleotide substitution (%)", + ) + fig.update_xaxes( + tickangle = -80, + title_font = {"size": 15}, + title_standoff = 25 + ) + + div = plotly.offline.plot(fig, include_plotlyjs=False, output_type='div') + + return div + + + def report_prepare(self,outdiv): + self.add_data_item(subsitution=outdiv) + + +@utils.add_log +def subsitution(args): + + step_name = "subsitution" + subsitution_obj = Subsitution(args, step_name) + subsitution_obj.run() + +def get_opts_subsitution(parser, sub_program): + if sub_program: + parser.add_argument('--bam', help='bam file', required=True) + parser = s_common(parser) + return parser diff --git a/celescope/templates/html/dynaseq/base.html b/celescope/templates/html/dynaseq/base.html new file mode 100755 index 00000000..e7adebfd --- /dev/null +++ b/celescope/templates/html/dynaseq/base.html @@ -0,0 +1,156 @@ + + + + + + report + + + + + + + + + + + + + +
+

DynaSCOPE Report

+ + {% if sample_summary is defined %} + {% include "html/common/sample_summary.html"%} + {% endif %} + + {% if barcode_summary is defined %} + {% include "html/common/barcode_summary.html"%} + {% endif %} + + {% if cutadapt_summary is defined %} + {% include "html/common/cutadapt_summary.html"%} + {% endif %} + + {% if star_summary is defined %} + {% include "html/rna/star_summary.html"%} + {% endif %} + + {% if featureCounts_summary is defined%} + {% include "html/rna/featureCounts_summary.html"%} + {% endif %} + + {% if umi_summary is defined %} + {% include "html/rna/umi_summary.html"%} + {% endif %} + + {% if cluster_tsne is defined %} + {% include "html/rna/analysis_summary.html"%} + {% endif %} + + {% if subsitution is defined %} + {% include "html/dynaseq/subsitution_summary.html"%} + {% endif %} + + {% if replacement is defined %} + {% include "html/dynaseq/replacement_summary.html"%} + {% endif %} + + {% if replace_tsne is defined %} + {% include "html/dynaseq/replace_tsne_summary.html"%} + {% endif %} + +
+ + + + diff --git a/celescope/templates/html/dynaseq/replace_tsne_summary.html b/celescope/templates/html/dynaseq/replace_tsne_summary.html new file mode 100644 index 00000000..d1f927e5 --- /dev/null +++ b/celescope/templates/html/dynaseq/replace_tsne_summary.html @@ -0,0 +1,54 @@ +

RNA Turn-over rate in clusters

+ +{{ replace_tsne|safe }} + + + + +

High Turn-over Genes in Cluster

+
+ + {{ replace_tsne_table|safe }} + + + +
+ +
+ + + diff --git a/celescope/templates/html/dynaseq/replacement_summary.html b/celescope/templates/html/dynaseq/replacement_summary.html new file mode 100644 index 00000000..bcf91dbd --- /dev/null +++ b/celescope/templates/html/dynaseq/replacement_summary.html @@ -0,0 +1,3 @@ +

RNA Turn-over rate

+ +{{ replacement|safe }} diff --git a/celescope/templates/html/dynaseq/subsitution_summary.html b/celescope/templates/html/dynaseq/subsitution_summary.html new file mode 100644 index 00000000..c774021f --- /dev/null +++ b/celescope/templates/html/dynaseq/subsitution_summary.html @@ -0,0 +1,13 @@ + +
+

Dynaseq Analysis

+
+ + + +

Sample substitution rate

+{{ subsitution|safe }} + diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index ae938ad0..306488e2 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -79,6 +79,7 @@ class Multi(): def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = {} + col5_dict = {} with open(mapfile) as fh: for line in fh: line = line.strip() @@ -86,7 +87,7 @@ class Multi(): continue line_split = line.split() library_id, library_path, sample_name = line_split[:3] - if len(line_split) == 4: + if len(line_split) >= 4: col4 = line_split[3] else: col4 = default_val @@ -98,7 +99,8 @@ class Multi(): else: fq_dict[sample_name] = [[fq1], [fq2]] col4_dict[sample_name] = col4 - + if len(line_split) == 5: + col5_dict[sample_name] = line_split[4] for sample_name in fq_dict: fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) @@ -106,7 +108,7 @@ class Multi(): if not fq_dict: raise Exception('empty mapfile!') - return fq_dict, col4_dict + return fq_dict, col4_dict,col5_dict def link_data(self): raw_dir = f'{self.args.outdir}/data_give/rawdata' @@ -122,7 +124,7 @@ class Multi(): parse_mapfile, link data, make log dir, init script variables, init outdir_dic """ # parse_mapfile - self.fq_dict, self.col4_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) + self.fq_dict, self.col4_dict, self.col5_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) # link self.link_data() diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 6baa77b6..f8e73fde 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -437,6 +437,7 @@ def get_fq(library_id, library_path): def parse_map_col4(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = defaultdict(list) + col5_dict = defaultdict(list) with open(mapfile) as fh: for line in fh: line = line.strip() @@ -448,7 +449,7 @@ def parse_map_col4(mapfile, default_val): library_id = tmp[0] library_path = tmp[1] sample_name = tmp[2] - if len(tmp) == 4: + if len(tmp) >= 4: col4 = tmp[3] else: col4 = default_val @@ -461,6 +462,8 @@ def parse_map_col4(mapfile, default_val): fq_dict[sample_name] = [[fq1], [fq2]] if col4 and col4 != default_val: col4_dict[sample_name] = col4 + if len(tmp) == 5: + col5_dict[sample_name] = tmp[4] for sample_name in fq_dict: fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) @@ -468,7 +471,7 @@ def parse_map_col4(mapfile, default_val): if not fq_dict: raise Exception('empty mapfile!') - return fq_dict, col4_dict + return fq_dict, col4_dict, col5_dict def generate_sjm(cmd, name, conda, m=1, x=1): diff --git a/docs/dynaseq/analysis.md b/docs/dynaseq/analysis.md new file mode 100644 index 00000000..d2a38bc9 --- /dev/null +++ b/docs/dynaseq/analysis.md @@ -0,0 +1,51 @@ +## Features +- Cell clustering with Seurat. + +- Calculate the marker gene of each cluster. + +- Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. + +## Output +- `markers.tsv` Marker genes of each cluster. + +- `tsne_coord.tsv` t-SNE coordinates and clustering information. + +- `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` +parameter is provided. The result contains 3 files: + - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", +it means that the given marker is not enough to identify the cluster. 
+    - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters.
+    - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1)
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--save_rds` Write rds to disk.
+
+`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example:
+```
+cell_type marker
+Alveolar "CLDN18,FOLR1,AQP4,PEBP4"
+Endothelial "CLDN5,FLT1,CDH5,RAMP2"
+Epithelial "CAPS,TMEM190,PIFO,SNTN"
+Fibroblast "COL1A1,DCN,COL1A2,C1R"
+B_cell "CD79A,IGKC,IGLC3,IGHG3"
+Myeloid "LYZ,MARCO,FCGR3A"
+T_cell "CD3D,TRBC1,TRBC2,TRAC"
+LUAD "NKX2-1,NAPSA,EPCAM"
+LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM"
+```
+
+`--matrix_file` Required. Matrix_10X directory from step count.
+
+`--outdir` output dir
+
+`--assay` assay
+
+`--sample` sample name
+
+`--thread` None
+
+`--debug` debug
+
diff --git a/docs/dynaseq/conversion.md b/docs/dynaseq/conversion.md
new file mode 100644
index 00000000..bfb0cb2a
--- /dev/null
+++ b/docs/dynaseq/conversion.md
@@ -0,0 +1,26 @@
+## Features
+- Get conversion positions in each read.
+- Get SNP info.
+
+## Output
+- `{sample}.PosTag.bam` BAM file with conversion info.
+- `{sample}.PosTag.csv` SNP info in csv format.
+
+
+## Arguments
+`--strand` Gene strand file, two columns with no header: gene_id and strand.
+
+`--bam` featureCounts output BAM file.
+
+`--cell` Cell barcode list file, one barcode per line.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
diff --git a/docs/dynaseq/replace_tsne.md b/docs/dynaseq/replace_tsne.md
new file mode 100644
index 00000000..31ed90ce
--- /dev/null
+++ b/docs/dynaseq/replace_tsne.md
@@ -0,0 +1,30 @@
+## Features
+- Plot the turn-over (replacement) rate of each cell on t-SNE coordinates.
+- Report the top replaced genes in each cluster.
+
+## Output
+- `{sample}.rep_in_tsne.txt` Turn-over rate of each cell, with t-SNE coordinates and cluster.
+- `{sample}.rep_in_tsne_top10.txt` Top 10 replaced genes in each cluster.
+
+
+## Arguments
+`--tsne` t-SNE coordinates file (tsne_coord.tsv from the analysis step).
+
+`--mat` Fraction-of-new-RNA matrix file from the replacement step.
+
+`--rep` Fraction-of-new-RNA per cell file from the replacement step.
+
+`--mincell` Only report genes with turn-over detected in at least this many cells. Default 5.
+
+`--topgene` Number of top genes to report per cluster. Default 10.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
diff --git a/docs/dynaseq/replacement.md b/docs/dynaseq/replacement.md
new file mode 100644
index 00000000..1184777c
--- /dev/null
+++ b/docs/dynaseq/replacement.md
@@ -0,0 +1,36 @@
+## Features
+- Computes the replacement rates in each cell and gene.
+- Boxplots of the rate distributions.
+
+## Output
+- `{sample}.TC_matrix.rds` New and old RNA info for each barcode/gene/UMI.
+- `{sample}.new_matrix.tsv.gz` New RNA matrix.
+- `{sample}.old_matrix.tsv.gz` Old RNA matrix.
+- `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell.
+- `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene.
+- `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene.
+
+
+## Arguments
+`--bg_cov` Background SNP depth filter; positions with depth lower than bg_cov will be discarded. Only valid in csv format.
+
+`--bam` BAM file with conversion tags (from the conversion step).
+
+`--bg` Background SNP file, csv or vcf format.
+
+`--cell_keep` Maximum number of cells to keep.
+
+`--min_cell` A gene must be expressed in at least this many cells. Default 10.
+
+`--min_gene` A cell must express at least this many genes. Default 10.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+ +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/star.md b/docs/dynaseq/star.md new file mode 100644 index 00000000..ec3b5211 --- /dev/null +++ b/docs/dynaseq/star.md @@ -0,0 +1,56 @@ +## Features +- Align R2 reads to the reference genome with STAR. +- Collect Metrics with Picard. + +## Output +- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. + +- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. + +- `{sample}_Log.out` Main log with a lot of detailed information about the run. +This is most useful for troubleshooting and debugging. + +- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, +% of mapped reads etc. It is updated in 1 minute intervals. + +- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, +very useful for quality control. The statistics are calculated for each read (single- or paired-end) and +then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, +(unlike the samtools agstat/idxstats, which count each mate separately). +Most of the information is collected about the UNIQUE mappers +(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). +Each splicing is counted in the numbers of splices, which would correspond to +summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, +i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. + +- `{sample}_region.log` Picard CollectRnaSeqMetrics results. + + +## Arguments +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fq` Required. R2 fastq file. + +`--consensus_fq` Input fastq has been consensused + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/subsitution.md b/docs/dynaseq/subsitution.md new file mode 100644 index 00000000..e2b7b169 --- /dev/null +++ b/docs/dynaseq/subsitution.md @@ -0,0 +1,20 @@ +## Features +- Computes the overall conversion rates in reads and plots a barplot. + +## Output +- `{sample}.substitution.txt` Tab-separated table of the overall conversion rates. + + +## Arguments +`--bam` bam file + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. 
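+
+## Example
+A minimal sketch of reading the substitution table (path is illustrative; the three
+columns are an assumption based on `sub_stat` in `subsitution.py`: conversion type,
+forward-strand rate and reverse-strand rate in percent):
+```
+import pandas as pd
+
+rates = pd.read_table("sample.substitution.txt", header=None,
+                      names=["conversion", "forward_pct", "reverse_pct"])
+# T_to_C on the forward strand is the metabolic labeling signal in dynaseq data
+print(rates[rates["conversion"] == "T_to_C"])
+```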
+ -- Gitee From 91de4f07f43dc6954d6fdafd5651a2fca92bb9c8 Mon Sep 17 00:00:00 2001 From: Tony Zhou Date: Thu, 24 Jun 2021 09:45:37 +0800 Subject: [PATCH 75/96] fix pysam.VariantFile --- celescope/dynaseq/replacement.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/celescope/dynaseq/replacement.py b/celescope/dynaseq/replacement.py index c8ac2a15..8c6a8119 100755 --- a/celescope/dynaseq/replacement.py +++ b/celescope/dynaseq/replacement.py @@ -138,8 +138,7 @@ class Replacement(Step): chr_pos = ii[1]+'_'+ii[5] outdict[chr_pos] = 1 elif bgfile.endswith('.vcf'): - from pysam import VariantFile - bcf_in = VariantFile(bgfile) + bcf_in = pysam.VariantFile(bgfile) for rec in bcf_in.fetch(): try: chrom, pos = rec.chrom, rec.pos -- Gitee From 89838794667a08bad5bcc2797d853afa151ae4ec Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 10:16:16 +0800 Subject: [PATCH 76/96] change tests folder position --- docs/CONTRIBUTING.md | 4 ++-- setup.sh | 7 ------- {celescope/tests => tests}/__init__.py | 0 {celescope/tests => tests}/conftest.py | 0 {celescope/tests => tests}/test_function.py | 0 {celescope/tests => tests}/test_multi.py | 0 6 files changed, 2 insertions(+), 9 deletions(-) delete mode 100755 setup.sh rename {celescope/tests => tests}/__init__.py (100%) rename {celescope/tests => tests}/conftest.py (100%) rename {celescope/tests => tests}/test_function.py (100%) rename {celescope/tests => tests}/test_multi.py (100%) diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md index 782c33ec..fb9ee986 100644 --- a/docs/CONTRIBUTING.md +++ b/docs/CONTRIBUTING.md @@ -98,9 +98,9 @@ git clone https://github.com/singleron-RD/celescope_tests.git Install pytest >>> pip install pytest Run all ->>> pytest -s celescope/tests/test_multi.py --test_dir {test_dir} +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} Run some tests ->>> pytest -s celescope/tests/test_multi.py --test_dir {test_dir} --assays rna,tag +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag ``` Then you need to create your own test based on this example. 
\ No newline at end of file diff --git a/setup.sh b/setup.sh deleted file mode 100755 index 082ee59b..00000000 --- a/setup.sh +++ /dev/null @@ -1,7 +0,0 @@ -git clone https://github.com/singleron-RD/CeleScope.git - -conda create -n celescope -conda activate celescope -conda install --file conda_pkgs.txt --channel conda-forge --channel bioconda --channel r --channel imperial-college-research-computing - -pip install celescope diff --git a/celescope/tests/__init__.py b/tests/__init__.py similarity index 100% rename from celescope/tests/__init__.py rename to tests/__init__.py diff --git a/celescope/tests/conftest.py b/tests/conftest.py similarity index 100% rename from celescope/tests/conftest.py rename to tests/conftest.py diff --git a/celescope/tests/test_function.py b/tests/test_function.py similarity index 100% rename from celescope/tests/test_function.py rename to tests/test_function.py diff --git a/celescope/tests/test_multi.py b/tests/test_multi.py similarity index 100% rename from celescope/tests/test_multi.py rename to tests/test_multi.py -- Gitee From 41972ec591a897d4cba5e47d8b71ebcc7af83624 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 10:44:05 +0800 Subject: [PATCH 77/96] remove duplicated star and analysis from dynaseq --- celescope/dynaseq/__init__.py | 5 ++ celescope/dynaseq/analysis.py | 104 ---------------------- celescope/dynaseq/star.py | 163 ---------------------------------- tests/test_multi.py | 1 + 4 files changed, 6 insertions(+), 267 deletions(-) delete mode 100755 celescope/dynaseq/analysis.py delete mode 100755 celescope/dynaseq/star.py diff --git a/celescope/dynaseq/__init__.py b/celescope/dynaseq/__init__.py index 6ed6df1f..ae7742a4 100755 --- a/celescope/dynaseq/__init__.py +++ b/celescope/dynaseq/__init__.py @@ -13,6 +13,11 @@ __STEPS__ = [ __ASSAY__ = 'dynaseq' +IMPORT_DICT = { + 'star': 'celescope.rna', + 'analysis': 'celescope.rna', +} + # m: memory # x: thread RESOURCE = { diff --git a/celescope/dynaseq/analysis.py b/celescope/dynaseq/analysis.py deleted file mode 100755 index ad5ec8f8..00000000 --- a/celescope/dynaseq/analysis.py +++ /dev/null @@ -1,104 +0,0 @@ -import pandas as pd - -from celescope.tools.analysis_mixin import AnalysisMixin -from celescope.tools.step import Step -from celescope.tools.utils import add_log, get_id_name_dict, s_common - - -@add_log -def generate_matrix(gtf_file, matrix_file): - - id_name = get_id_name_dict(gtf_file) - matrix = pd.read_csv(matrix_file, sep="\t") - - gene_name_col = matrix.geneID.apply(lambda x: id_name[x]) - matrix.geneID = gene_name_col - matrix = matrix.drop_duplicates(subset=["geneID"], keep="first") - matrix = matrix.dropna() - matrix = matrix.rename({"geneID": ""}, axis='columns') - return matrix - - -class Analysis_dynaseq(Step, AnalysisMixin): - """ - Features - - Cell clustering with Seurat. - - - Calculate the marker gene of each cluster. - - - Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. - - Output - - `markers.tsv` Marker genes of each cluster. - - - `tsne_coord.tsv` t-SNE coordinates and clustering information. - - - `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` - parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", - it means that the given marker is not enough to identify the cluster. 
- - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) - """ - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - AnalysisMixin.__init__(self, args) - self.matrix_file = args.matrix_file - self.genomeDir = args.genomeDir - self.type_marker_tsv = args.type_marker_tsv - self.auto_assign_bool = False - self.save_rds = args.save_rds - if args.type_marker_tsv and args.type_marker_tsv != 'None': - self.auto_assign_bool = True - self.save_rds = True - - def run(self): - self.seurat(self.matrix_file, self.save_rds, self.genomeDir) - if self.auto_assign_bool: - self.auto_assign(self.type_marker_tsv) - self.run_analysis() - self.add_data_item(cluster_tsne=self.cluster_tsne) - self.add_data_item(gene_tsne=self.gene_tsne) - self.add_data_item(table_dict=self.table_dict) - - self.clean_up() - - -@add_log -def analysis(args): - - step_name = "analysis" - ana = Analysis_dynaseq(args, step_name) - ana.run() - - -def get_opts_analysis(parser, sub_program): - - parser.add_argument('--genomeDir', help='Required. Genome directory.', required=True) - parser.add_argument('--save_rds', action='store_true', help='Write rds to disk.') - parser.add_argument( - '--type_marker_tsv', - help="""A tsv file with header. If this parameter is provided, cell type will be annotated. Example: -``` -cell_type marker -Alveolar "CLDN18,FOLR1,AQP4,PEBP4" -Endothelial "CLDN5,FLT1,CDH5,RAMP2" -Epithelial "CAPS,TMEM190,PIFO,SNTN" -Fibroblast "COL1A1,DCN,COL1A2,C1R" -B_cell "CD79A,IGKC,IGLC3,IGHG3" -Myeloid "LYZ,MARCO,FCGR3A" -T_cell "CD3D,TRBC1,TRBC2,TRAC" -LUAD "NKX2-1,NAPSA,EPCAM" -LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" -```""" - ) - if sub_program: - parser.add_argument( - '--matrix_file', - help='Required. Matrix_10X directory from step count.', - required=True, - ) - parser = s_common(parser) - - - diff --git a/celescope/dynaseq/star.py b/celescope/dynaseq/star.py deleted file mode 100755 index 525cfcf6..00000000 --- a/celescope/dynaseq/star.py +++ /dev/null @@ -1,163 +0,0 @@ -import subprocess - -import pandas as pd - -import celescope.tools.utils as utils -from celescope.__init__ import ROOT_PATH -from celescope.tools.star_mixin import StarMixin, get_opts_star_mixin -from celescope.tools.step import Step - - -class Star_dynaseq(Step, StarMixin): - """ - Features - - Align R2 reads to the reference genome with STAR. - - Collect Metrics with Picard. - - Output - - `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - - - `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - - - `{sample}_Log.out` Main log with a lot of detailed information about the run. - This is most useful for troubleshooting and debugging. - - - `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, - % of mapped reads etc. It is updated in 1 minute intervals. - - - `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, - very useful for quality control. The statistics are calculated for each read (single- or paired-end) and - then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, - (unlike the samtools agstat/idxstats, which count each mate separately). 
- Most of the information is collected about the UNIQUE mappers - (unlike samtools agstat/idxstats which does not separate unique or multi-mappers). - Each splicing is counted in the numbers of splices, which would correspond to - summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, - i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - - - `{sample}_region.log` Picard CollectRnaSeqMetrics results. - """ - - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - StarMixin.__init__(self, args) - # parse - self.refflat = f"{self.genomeDir}/{self.genome['refflat']}" - - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' - self.picard_region_log = f'{self.outdir}/{self.sample}_region.log' - self.plot = None - self.stats = pd.Series() - - def add_other_metrics(self): - """ - add picard region bases - add region plot - if debug, add ribosomal RNA reads percent - """ - - with open(self.picard_region_log, 'r') as picard_log: - region_dict = {} - for line in picard_log: - if not line: - break - if line.startswith('## METRICS CLASS'): - header = picard_log.readline().strip().split('\t') - data = picard_log.readline().strip().split('\t') - region_dict = dict(zip(header, data)) - break - - total = float(region_dict['PF_ALIGNED_BASES']) - exonic_regions = int(region_dict['UTR_BASES']) + \ - int(region_dict['CODING_BASES']) - intronic_regions = int(region_dict['INTRONIC_BASES']) - intergenic_regions = int(region_dict['INTERGENIC_BASES']) - - self.add_metric( - name='Base Pairs Mapped to Exonic Regions', - value=exonic_regions, - total=total, - ) - self.add_metric( - name='Base Pairs Mapped to Intronic Regions', - value=intronic_regions, - total=total, - ) - self.add_metric( - name='Base Pairs Mapped to Intergenic Regions', - value=intergenic_regions, - total=total, - ) - - # ribo - if self.debug: - with open(self.ribo_log, 'r') as ribo_log: - for line in ribo_log: - if line.find('#Matched') != -1: - items = line.split() - Reads_Mapped_to_rRNA = int(items[1]) - if line.find('#Total') != -1: - items = line.split() - Reads_Total = int(items[1]) - self.add_metric( - name=f'{self.stat_prefix} Mapped to rRNA', - value=Reads_Mapped_to_rRNA, - total=Reads_Total, - ) - - region_plot = {'region_labels': ['Exonic Regions', 'Intronic Regions', 'Intergenic Regions'], - 'region_values': [exonic_regions, intronic_regions, intergenic_regions]} - self.add_content_item("data", STAR_plot=region_plot) - - - @utils.add_log - def ribo(self): - human_ribo_fa = f'{ROOT_PATH}/data/rRNA/human_ribo.fasta' - self.ribo_log = f'{self.outdir}/{self.sample}_ribo_log.txt' - self.ribo_run_log = f'{self.outdir}/{self.sample}_ribo_run.log' - cmd = ( - f'bbduk.sh ' - f'in1={self.fq} ' - f'ref={human_ribo_fa} ' - f'stats={self.ribo_log} ' - f'overwrite=t ' - f'> {self.ribo_run_log} 2>&1 ' - ) - Star_dynaseq.ribo.logger.info(cmd) - subprocess.check_call(cmd, shell=True) - - @utils.add_log - def picard(self): - cmd = [ - 'picard', - '-Xmx20G', - '-XX:ParallelGCThreads=4', - 'CollectRnaSeqMetrics', - 'I=%s' % (self.STAR_bam), - 'O=%s' % (self.picard_region_log), - 'REF_FLAT=%s' % (self.refflat), - 'STRAND=NONE', - 'VALIDATION_STRINGENCY=SILENT'] - cmd_str = ' '.join(cmd) - Star_dynaseq.picard.logger.info(cmd_str) - subprocess.check_call(cmd) - - @utils.add_log - def run(self): - self.run_star() - self.picard() - if self.debug: - self.ribo() - 
self.add_other_metrics() - self.clean_up() - - -def star(args): - step_name = "star" - runner = Star_dynaseq(args, step_name) - runner.run() - - -def get_opts_star(parser, sub_program): - get_opts_star_mixin(parser, sub_program) diff --git a/tests/test_multi.py b/tests/test_multi.py index fe853db7..b1004f16 100755 --- a/tests/test_multi.py +++ b/tests/test_multi.py @@ -15,6 +15,7 @@ ASSAYS = [ 'capture_virus', 'snp', 'rna', + 'dynaseq', ] -- Gitee From f1b13b59da28ceda817cbfb4c92939828d9c947a Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 10:50:02 +0800 Subject: [PATCH 78/96] analysis fix --- celescope/rna/analysis.py | 10 +++---- docs/dynaseq/analysis.md | 51 ----------------------------------- docs/dynaseq/star.md | 56 --------------------------------------- docs/rna/analysis.md | 10 +++---- 4 files changed, 10 insertions(+), 117 deletions(-) delete mode 100644 docs/dynaseq/analysis.md delete mode 100644 docs/dynaseq/star.md diff --git a/celescope/rna/analysis.py b/celescope/rna/analysis.py index 8045e329..50cc4e58 100755 --- a/celescope/rna/analysis.py +++ b/celescope/rna/analysis.py @@ -1,14 +1,14 @@ import pandas as pd from celescope.tools.analysis_mixin import AnalysisMixin -from celescope.tools.step import Step -from celescope.tools.utils import add_log, get_id_name_dict, s_common +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils -@add_log +@utils.add_log def generate_matrix(gtf_file, matrix_file): - id_name = get_id_name_dict(gtf_file) + id_name = utils.get_id_name_dict(gtf_file) matrix = pd.read_csv(matrix_file, sep="\t") gene_name_col = matrix.geneID.apply(lambda x: id_name[x]) @@ -65,7 +65,7 @@ class Analysis_rna(Step, AnalysisMixin): self.clean_up() -@add_log +@utils.add_log def analysis(args): step_name = "analysis" diff --git a/docs/dynaseq/analysis.md b/docs/dynaseq/analysis.md deleted file mode 100644 index d2a38bc9..00000000 --- a/docs/dynaseq/analysis.md +++ /dev/null @@ -1,51 +0,0 @@ -## Features -- Cell clustering with Seurat. - -- Calculate the marker gene of each cluster. - -- Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. - -## Output -- `markers.tsv` Marker genes of each cluster. - -- `tsne_coord.tsv` t-SNE coordinates and clustering information. - -- `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` -parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", -it means that the given marker is not enough to identify the cluster. - - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--save_rds` Write rds to disk. - -`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: -``` -cell_type marker -Alveolar "CLDN18,FOLR1,AQP4,PEBP4" -Endothelial "CLDN5,FLT1,CDH5,RAMP2" -Epithelial "CAPS,TMEM190,PIFO,SNTN" -Fibroblast "COL1A1,DCN,COL1A2,C1R" -B_cell "CD79A,IGKC,IGLC3,IGHG3" -Myeloid "LYZ,MARCO,FCGR3A" -T_cell "CD3D,TRBC1,TRBC2,TRAC" -LUAD "NKX2-1,NAPSA,EPCAM" -LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" -``` - -`--matrix_file` Required. Matrix_10X directory from step count. 
- -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - diff --git a/docs/dynaseq/star.md b/docs/dynaseq/star.md deleted file mode 100644 index ec3b5211..00000000 --- a/docs/dynaseq/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). -Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna/analysis.md b/docs/rna/analysis.md index d2a38bc9..9ddfd1b3 100644 --- a/docs/rna/analysis.md +++ b/docs/rna/analysis.md @@ -39,13 +39,13 @@ LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" `--matrix_file` Required. Matrix_10X directory from step count. -`--outdir` output dir +`--outdir` Output diretory. -`--assay` assay +`--assay` Assay name. -`--sample` sample name +`--sample` Sample name. -`--thread` None +`--thread` Thread to use. -`--debug` debug +`--debug` If this argument is used, celescope may output addtional file for debugging. 
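
For context, `generate_matrix` in celescope/rna/analysis.py swaps Ensembl gene IDs
for gene names before clustering. A minimal sketch of that swap, with a hypothetical
two-gene mapping standing in for `utils.get_id_name_dict`:
```
import pandas as pd

# Hypothetical mapping; the real dict is parsed from the GTF by utils.get_id_name_dict.
id_name = {"ENSG00000141510": "TP53", "ENSG00000012048": "BRCA1"}

matrix = pd.DataFrame({"geneID": ["ENSG00000141510", "ENSG00000012048"],
                       "cell_1": [3, 0], "cell_2": [1, 2]})
matrix.geneID = matrix.geneID.apply(lambda x: id_name[x])
matrix = matrix.drop_duplicates(subset=["geneID"], keep="first").dropna()
matrix = matrix.rename({"geneID": ""}, axis="columns")
print(matrix)
```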
-- Gitee From f778ab0483c0629f18e5df20991f1044f296d31c Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 24 Jun 2021 11:37:19 +0800 Subject: [PATCH 79/96] add assembly length ang clean fq --- celescope/trust_vdj/res_filter.py | 29 +++++++++++--- celescope/trust_vdj/trust_assemble.py | 58 ++++++++++++++++++++++----- 2 files changed, 71 insertions(+), 16 deletions(-) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index 883ae849..ed7a2720 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -4,10 +4,24 @@ from celescope.tools import utils from collections import defaultdict from celescope.tools.cellranger3 import get_plot_elements import numpy as np +import pysam + + +def get_len(fa): + with pysam.FastaFile(fa) as fh: + res = {} + names = fh.references + lengths = fh.lengths + res['contig_id'] = names + res['length'] = lengths + + df = pd.DataFrame(res, columns=list(res.keys())) + return df @utils.add_log -def beauty_report(barcode_report): +def beauty_report(barcode_report, fa): + df_len = get_len(fa) df = pd.read_csv(barcode_report, sep='\t') rows = df.shape[0] chains = ['chain2', 'chain1'] @@ -16,7 +30,7 @@ def beauty_report(barcode_report): for l in range(len(chains)): chain = chains[l] - items = {'V': 0, 'D': 1, 'J': 2, 'C': 3, 'CDR3nt': 4, 'CDR3aa': 5, 'readcount': 6, 'full_length_assembly': -1} + items = {'V': 0, 'D': 1, 'J': 2, 'C': 3, 'CDR3nt': 4, 'CDR3aa': 5, 'readcount': 6, 'contig_id': -3, 'full_length_assembly': -1} for i in range(rows): cb = df.loc[i, '#barcode'] @@ -33,9 +47,11 @@ def beauty_report(barcode_report): res = pd.DataFrame(dic, columns=list(dic.keys())) - return res + df_res = pd.merge(res, df_len, on='contig_id', how='inner') + return df_res +@utils.add_log def get_clone_table(df, Seqtype): res_filter_summary = [] @@ -46,7 +62,7 @@ def get_clone_table(df, Seqtype): paired_groups = ['TRA_TRB'] if Seqtype == 'BCR': chains = ['IGH', 'IGL', 'IGK'] - paired_groups = ['IGH_IHL', 'IGH_IGK'] + paired_groups = ['IGH_IGL', 'IGH_IGK'] for chain in chains: tmp = df[df['V'].str.contains(chain, na=False)] tmp = tmp.set_index('barcode') @@ -126,11 +142,12 @@ class Res_filter(Step): @utils.add_log def run(self): barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' - df = beauty_report(barcode_report) + fa = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_annot.fa' + df = beauty_report(barcode_report, fa) if self.full_length: df = df[df['full_length_assembly']=='1'] - df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t') + df.to_csv(f'{self.outdir}/{self.sample}_barcode_report.tsv', sep='\t', index=False) clones, res_filter_summary = get_clone_table(df, self.Seqtype) diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/trust_assemble.py index e052d0ae..8cf8f76a 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/trust_assemble.py @@ -47,6 +47,21 @@ def match_barcodes(outdir, match_dir, Seqtype, fq1): seqlist.write(str(name) + '\n') +def clean_fq(fq1, fq2, outdir, sample, species): + + prefix = f'{outdir}/{sample}_clean' + + cmd = ( + f'/SGRNJ03/randd/zhouxin/software/TRUST4/fastq-extractor ' + f'-t 10 -f /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa ' + f'-o {prefix} --barcodeStart 0 --barcodeEnd 23 ' + f'-u {fq2} ' + f'--barcode {fq1}' + ) + + os.system(cmd) + + def mapping_summary(outdir, Seqtype, fq, species): stat_file = outdir + '/stat.txt' @@ -93,15 +108,32 @@ def 
mapping_summary(outdir, Seqtype, fq, species): 'count': count, 'total_count': total_count, }) + # os.system(f'rm {outdir}/{locus}.sam') - os.system(f'rm {outdir}/{locus}.sam') - - trust_assemble_summary.insert(0, { - 'item': stat_string, - 'count': total_mapped, - 'total_count': total_count - }) - + # total mapping + cmd = ( + f'source activate bracer; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{Seqtype} ' + f'-U {fq} ' + f'-S {outdir}/{Seqtype}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + count = int(res[0]) + trust_assemble_summary.insert(0, { + 'item': stat_string, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/*.sam') os.system(f'rm {outdir}/log') df = pd.DataFrame(trust_assemble_summary, columns=['item', 'count', 'total_count']) @@ -147,7 +179,8 @@ class Trust_assemble(Step): @utils.add_log def run(self): - self.getFqfile() + if not os.path.exists(f'{self.outdir}/{self.sample}_matched_R2.fq'): + self.getFqfile() species = self.species @@ -175,7 +208,12 @@ class Trust_assemble(Step): #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' - mapping_summary(self.outdir, self.Seqtype, self.fq2, species) + # report + clean_fq(self.fq1, self.fq2, self.outdir, self.sample, species) + + fq = f'{self.outdir}/{self.sample}_clean.fq' + + mapping_summary(self.outdir, self.Seqtype, fq, species) os.remove(f'{self.outdir}/seqlist.txt') -- Gitee From 7444c8e3edf6d4f6fd050713e6de555f51d8156b Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Thu, 24 Jun 2021 11:39:26 +0800 Subject: [PATCH 80/96] add bowtie2 and plotly==4.14.3 --- conda_pkgs.txt | 3 ++- setup.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/conda_pkgs.txt b/conda_pkgs.txt index 22f5fab0..99567356 100755 --- a/conda_pkgs.txt +++ b/conda_pkgs.txt @@ -11,4 +11,5 @@ r-tidyverse mixcr=3.0.3 bioconductor-dropletutils bcftools==1.9 -seqkt \ No newline at end of file +seqkt +bowtie2 \ No newline at end of file diff --git a/setup.py b/setup.py index f6256a90..a23429ce 100755 --- a/setup.py +++ b/setup.py @@ -41,6 +41,6 @@ setuptools.setup( 'editdistance>=0.5.3', 'mutract', 'sklearn', - 'plotly', + 'plotly==4.14.3', ] ) -- Gitee From a2b5d20e810b55a864f29542b388e4e33ca2b034 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 16:40:58 +0800 Subject: [PATCH 81/96] add scripts --- celescope/scripts/gene_umi_summary.R | 28 +++++++ celescope/scripts/mt_summary.py | 106 +++++++++++++++++++++++++++ 2 files changed, 134 insertions(+) create mode 100644 celescope/scripts/gene_umi_summary.R create mode 100644 celescope/scripts/mt_summary.py diff --git a/celescope/scripts/gene_umi_summary.R b/celescope/scripts/gene_umi_summary.R new file mode 100644 index 00000000..60e93751 --- /dev/null +++ b/celescope/scripts/gene_umi_summary.R @@ -0,0 +1,28 @@ +library(Seurat) +library(tidyverse) +library(argparser) + +argv <- arg_parser('') +argv <- add_argument(argv,"--matrix_dir", help="") +argv <- add_argument(argv,"--outdir", help="") +argv <- add_argument(argv,"--sample", help="") +argv <- add_argument(argv,"--mt_gene_list_file", help="") +argv <- parse_args(argv) + +matrix_dir = argv$matrix_dir +outdir = argv$outdir +sample = argv$sample +mt_gene_list_file = argv$mt_gene_list_file + +# out +df.out = 
str_glue("{outdir}/{sample}_MT_UMI.tsv")
+
+mtx = Read10X(matrix_dir)
+mt_gene_list = read.table(mt_gene_list_file)[,1]
+
+gene_valid = rownames(mtx)
+gene_intersect = intersect(gene_valid, mt_gene_list)
+cells = dim(mtx)[2]
+mean_UMI = sort(round(rowSums(mtx[gene_intersect,]) / cells,3), decreasing = T)
+df = as.data.frame(mean_UMI)
+write.table(df, df.out, sep='\t', col.names=NA)
diff --git a/celescope/scripts/mt_summary.py b/celescope/scripts/mt_summary.py
new file mode 100644
index 00000000..20019d02
--- /dev/null
+++ b/celescope/scripts/mt_summary.py
@@ -0,0 +1,106 @@
+import argparse
+import glob
+import os
+import subprocess
+
+import pandas as pd
+from plotnine import ggplot, aes, geom_line
+
+from celescope.celescope import ArgFormatter
+from celescope.__init__ import HELP_DICT, ROOT_PATH
+from celescope.rna.mkref import parse_genomeDir_rna
+import celescope.tools.utils as utils
+
+SAMPLE_COL_INDEX = 2
+
+def parse_mapfile(mapfile):
+    sample_set = set()
+    df_mapfile = pd.read_csv(mapfile, sep='\t', header=None)
+
+    def read_row(row):
+        sample = row[SAMPLE_COL_INDEX]
+        sample_set.add(sample)
+
+    df_mapfile.apply(read_row, axis=1)
+    return sample_set
+
+
+class Mt_summary():
+    def __init__(self, sample, outdir, genomeDir):
+        self.sample = sample
+        self.outdir = outdir
+
+        # set
+        self.mt_gene_list_file = parse_genomeDir_rna(genomeDir)['mt_gene_list']
+        self.featureCounts_bam = None
+        try:
+            self.featureCounts_bam = glob.glob(f'{sample}/*featureCounts/{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam')[0]
+        except IndexError:
+            print("featureCounts bam does not exist! Skip coverage summary.")
+
+        self.matrix_dir = glob.glob(f'{sample}/*count/{sample}_matrix_10X')[0]
+
+        # out
+        if not os.path.exists(outdir):
+            os.system(f'mkdir -p {outdir}')
+        out_prefix = f'{outdir}/{sample}'
+        self.mt_bam = f'{out_prefix}_mt.bam'
+        self.mt_depth = f'{out_prefix}_mt_depth.tsv'
+        self.coverage_plot = f'{out_prefix}_mt_coverage.png'
+
+    @utils.add_log
+    def samtools(self):
+        cmd = (
+            f'samtools index {self.featureCounts_bam};'
+            f'samtools view -b {self.featureCounts_bam} MT -o {self.mt_bam};'
+            f'samtools depth -a {self.mt_bam} > {self.mt_depth}'
+        )
+        self.samtools.logger.info(cmd)
+        subprocess.check_call(cmd, shell=True)
+
+    @utils.add_log
+    def umi_summary(self):
+        cmd = (
+            f'Rscript {ROOT_PATH}/scripts/gene_umi_summary.R '
+            f'--sample {self.sample} '
+            f'--outdir {self.outdir} '
+            f'--mt_gene_list_file {self.mt_gene_list_file} '
+            f'--matrix_dir {self.matrix_dir} '
+        )
+        self.umi_summary.logger.info(cmd)
+        subprocess.check_call(cmd, shell=True)
+
+    @utils.add_log
+    def coverage_summary(self):
+        self.samtools()
+        df = pd.read_csv(self.mt_depth, sep='\t', header=None)
+        df.columns = ["MT", "position", "read_count"]
+        plot = ggplot(df, aes(x="position",y="read_count")) + geom_line()
+        plot.save(self.coverage_plot)
+
+    @utils.add_log
+    def run(self):
+        if self.featureCounts_bam:
+            self.umi_summary()
+            self.coverage_summary()
+
+
+def main():
+    parser = argparse.ArgumentParser(description='mt summary', formatter_class=ArgFormatter)
+    parser.add_argument("--mapfile", help="mapfile with sample names in the 3rd column", required=True)
+    parser.add_argument("--genomeDir", help=HELP_DICT["genomeDir"],
+        default='/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92')
+    parser.add_argument("--outdir", help="output dir", default='mt_summary')
+    args = parser.parse_args()
+
+    sample_set = parse_mapfile(args.mapfile)
+    for sample in sample_set:
+        runner = Mt_summary(
+            sample=sample,
+            outdir=args.outdir,
+            
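# genomeDir must be a `celescope rna mkref` output: Mt_summary.__init__ reads its `mt_gene_list` entry via parse_genomeDir_rna
+            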
genomeDir=args.genomeDir, + ) + runner.run() + +if __name__ == '__main__': + main() \ No newline at end of file -- Gitee From 10fe5f9e3e500704e7c1241c30ff4535f6c42709 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Thu, 24 Jun 2021 17:57:49 +0800 Subject: [PATCH 82/96] mean read per cell --- celescope/scripts/gene_umi_summary.R | 4 ++-- celescope/scripts/mt_summary.py | 27 ++++++++++++++++----------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/celescope/scripts/gene_umi_summary.R b/celescope/scripts/gene_umi_summary.R index 60e93751..d6f2a33c 100644 --- a/celescope/scripts/gene_umi_summary.R +++ b/celescope/scripts/gene_umi_summary.R @@ -15,7 +15,7 @@ sample = argv$sample mt_gene_list_file = argv$mt_gene_list_file # out -df.out = str_glue("{outdir}/{sample}_MT_UMI.tsv") +df.out = str_glue("{outdir}/{sample}_mt_UMI.tsv") mtx = Read10X(matrix_dir) mt_gene_list = read.table(mt_gene_list_file)[,1] @@ -25,4 +25,4 @@ gene_intersect = intersect(gene_valid, mt_gene_list) cells = dim(mtx)[2] mean_UMI = sort(round(rowSums(mtx[gene_intersect,]) / cells,3), decreasing = T) df = as.data.frame(mean_UMI) -write.table(df, df.out, sep='\t', col.names=NA) +write.table(df, df.out, sep='\t', col.names=NA, quote = F) diff --git a/celescope/scripts/mt_summary.py b/celescope/scripts/mt_summary.py index 20019d02..fd5b8e67 100644 --- a/celescope/scripts/mt_summary.py +++ b/celescope/scripts/mt_summary.py @@ -26,19 +26,22 @@ def parse_mapfile(mapfile): class Mt_summary(): - def __init__(self, sample, outdir, genomeDir): + def __init__(self, sample, outdir, genomeDir, root_dir): self.sample = sample self.outdir = outdir # set + match_dir = f'{root_dir}/{sample}' self.mt_gene_list_file = parse_genomeDir_rna(genomeDir)['mt_gene_list'] - self.featureCounts_bam = None + _barcodes, self.ncell = utils.read_barcode_file(match_dir) + self.bam = None try: - self.featureCounts_bam = glob.glob(f'{sample}/*featureCounts/{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam')[0] + self.bam = glob.glob( + f'{match_dir}/03*/{sample}*sortedByCoord.out.bam')[0] except IndexError: - print("featureCounts bam does not exist! Skip coverage summary.") + print("STAR bam does not exist! 
Skip coverage summary.") - self.matrix_dir = glob.glob(f'{sample}/*count/{sample}_matrix_10X')[0] + self.matrix_dir = glob.glob(f'{match_dir}/*count/{sample}_matrix_10X')[0] # out if not os.path.exists(outdir): @@ -51,8 +54,7 @@ class Mt_summary(): @utils.add_log def samtools(self): cmd = ( - f'samtools index {self.featureCounts_bam};' - f'samtools view -b {self.featureCounts_bam} MT -o {self.mt_bam};' + f'samtools view -b {self.bam} MT -o {self.mt_bam};' f'samtools depth -a {self.mt_bam} > {self.mt_depth}' ) self.samtools.logger.info(cmd) @@ -75,14 +77,15 @@ class Mt_summary(): self.samtools() df = pd.read_csv(self.mt_depth, sep='\t', header=None) df.columns = ["MT", "position", "read_count"] - plot = ggplot(df, aes(x="position",y="read_count")) + geom_line() + df["mean_read_count_per_cell"] = df["read_count"].apply(lambda x: x / self.ncell) + plot = ggplot(df, aes(x="position", y="mean_read_count_per_cell")) + geom_line() plot.save(self.coverage_plot) @utils.add_log def run(self): - if self.featureCounts_bam: - self.umi_summary() - self.coverage_summary() + self.umi_summary() + if self.bam: + self.coverage_summary() def main(): @@ -90,6 +93,7 @@ def main(): parser.add_argument("--mapfile", help="mapfile with VIDs as 5th column", required=True) parser.add_argument("--genomeDir", help=HELP_DICT["genomeDir"], default='/SGRNJ/Public/Database/genome/homo_sapiens/ensembl_92') + parser.add_argument("--root_dir", help='input root_dir', default='./') parser.add_argument("--outdir", help="output dir", default='mt_summary') args = parser.parse_args() @@ -99,6 +103,7 @@ def main(): sample=sample, outdir=args.outdir, genomeDir=args.genomeDir, + root_dir=args.root_dir, ) runner.run() -- Gitee From 1e3feab042f2a6948de7f52841c9ab38ccfc3b26 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 09:57:59 +0800 Subject: [PATCH 83/96] update --- celescope/__init__.py | 2 ++ celescope/snp/mkref.py | 19 +++++++++++++++--- celescope/tools/multi.py | 43 +++++++++++++++++++++------------------- celescope/tools/step.py | 6 +++--- docs/snp/mkref.md | 8 ++++++++ generate_docs.py | 4 ++-- 6 files changed, 54 insertions(+), 28 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index d71b3352..e89d5496 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -27,4 +27,6 @@ HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', 'gene_list': 'Gene list file, one gene symbol per line. Only results of these genes are reported.', 'genomeDir': 'Genome directory after running `mkref`.', + 'thread': 'Thread to use.', + 'debug': 'If this argument is used, celescope may output addtional file for debugging.', } diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py index b5632a9f..32fd2dd0 100644 --- a/celescope/snp/mkref.py +++ b/celescope/snp/mkref.py @@ -9,9 +9,22 @@ from celescope.tools.mkref import get_opts_mkref as opts class Mkref_snp(Mkref): """ - https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format - Create dictionary file and fasta index for gatk SplitNCigarReads. - Need to build on top of a rna genome. + Features + - Create dictionary file and fasta index for gatk SplitNCigarReads. 
+ (https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) + Need to run `celescope rna mkref` first + + Output + - fasta index + - gatk dictionary file + + Usage + ``` + # run celescope rna mkref first + celescope snp mkref \ + --genome_name Homo_sapiens_ensembl_99 \ + --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa + ``` """ def __init__(self, genome_type, args): diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 8b3351ff..4df0d7e8 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -7,6 +7,7 @@ from collections import defaultdict import celescope import celescope.tools.utils as utils from celescope.celescope import ArgFormatter +from celescope.__init__ import HELP_DICT TOOLS_DIR = os.path.dirname(celescope.tools.__file__) @@ -21,7 +22,6 @@ class Multi(): self.__APP__ = 'celescope' self.col4_default = None self.last_step = '' - self.args = None self.steps_not_run = ['mkref'] # remove @@ -42,7 +42,20 @@ class Multi(): elif self.args.steps_run: self.steps_run = self.args.steps_run.strip().split(',') - self.prepare() + # init + self.fq_dict = {} + self.col4_dict = {} + self.col5_dict = {} + self.logdir = self.args.outdir + '/log' + + # script init + self.sjm_cmd = f'log_dir {self.logdir}\n' + self.sjm_order = '' + self.shell_dict = defaultdict(str) + + # outdir dict + self.outdir_dic = {} + def common_args(self): readme = f'{self.__ASSAY__} multi-samples' @@ -57,15 +70,15 @@ class Multi(): 1st col: LibName; 2nd col: DataDir; 3rd col: SampleName; - 4th col: Cell number or match_dir, optional; + 4th col: optional; ''', required=True) parser.add_argument('--rm_files', action='store_true', help='remove redundant fq.gz and bam after running') - parser.add_argument('--steps_run', help='steps to run', default='all') + parser.add_argument('--steps_run', help='Steps to run. 
Multiple Steps are separated by comma.', default='all') # sub_program parser do not have - parser.add_argument('--outdir', help='output dir', default="./") - parser.add_argument('--debug', help='debug or not', action='store_true') - parser.add_argument('--thread', help='thread', default=4) + parser.add_argument('--outdir', help='Output directory.', default="./") + parser.add_argument('--thread', help=HELP_DICT['thread'], default=4) + parser.add_argument('--debug', help=HELP_DICT['debug'], action='store_true') self.parser = parser return parser @@ -122,26 +135,15 @@ class Multi(): def prepare(self): """ - parse_mapfile, link data, make log dir, init script variables, init outdir_dic + parse_mapfile, make log dir, init script variables, init outdir_dic """ # parse_mapfile self.fq_dict, self.col4_dict, self.col5_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) - # link - self.link_data() - # mk log dir - self.logdir = self.args.outdir + '/log' if self.args.mod == 'sjm': os.system('mkdir -p %s' % (self.logdir)) - # script init - self.sjm_cmd = 'log_dir %s\n' % (self.logdir) - self.sjm_order = '' - self.shell_dict = defaultdict(str) - - # outdir dict - self.outdir_dic = {} for sample in self.fq_dict: self.outdir_dic[sample] = {} index = 0 @@ -190,7 +192,7 @@ job_end f'--thread {self.args.thread} ' ) cmd_line = step_prefix - if self.args.debug or self.__CONDA__ == "celescope_RD": + if self.args.debug: cmd_line += " --debug " for arg in args_dict: if args_dict[arg] is False: @@ -335,6 +337,7 @@ job_end f.write(self.shell_dict[sample]) def run(self): + self.prepare() self.run_steps() self.end() diff --git a/celescope/tools/step.py b/celescope/tools/step.py index 39e75b59..7f3224b6 100755 --- a/celescope/tools/step.py +++ b/celescope/tools/step.py @@ -10,6 +10,7 @@ import pandas as pd from jinja2 import Environment, FileSystemLoader, select_autoescape from celescope.tools.utils import add_log +from celescope.__init__ import HELP_DICT Metric = namedtuple("Metric", "name value total fraction") @@ -21,9 +22,8 @@ def s_common(parser): parser.add_argument('--outdir', help='Output diretory.', required=True) parser.add_argument('--assay', help='Assay name.', required=True) parser.add_argument('--sample', help='Sample name.', required=True) - parser.add_argument('--thread', help='Thread to use.', default=4) - parser.add_argument( - '--debug', help='If this argument is used, celescope may output addtional file for debugging.', action='store_true') + parser.add_argument('--thread', help=HELP_DICT['thread'], default=4) + parser.add_argument('--debug', help=HELP_DICT['debug'], action='store_true') return parser diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md index 7f12cc92..b78d34f8 100644 --- a/docs/snp/mkref.md +++ b/docs/snp/mkref.md @@ -1,3 +1,11 @@ +## Features +- Create dictionary file and fasta index for gatk SplitNCigarReads. 
+(https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) +Need to run `celescope rna mkref` first + +## Output +- fasta index +- gatk dictionary file ## Arguments diff --git a/generate_docs.py b/generate_docs.py index fc6b0845..a43105a0 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -36,7 +36,7 @@ def generate_single_step_doc(assay, step): def get_argument_docs(func_opts): argument_docs = "" - parser = argparse.ArgumentParser(description='CeleScope',formatter_class=ArgFormatter) + parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) func_opts(parser, sub_program=True) for argument in parser._option_string_actions: if not argument in ['-h', '--help']: @@ -49,7 +49,7 @@ def get_argument_docs(func_opts): def get_class_docs(step_module): - titles = ("Features", "Output") + titles = ("Features", "Output", "Usage") class_docs = "" for child in inspect.getmembers(step_module, inspect.isclass): """Filter out class not defined in step_module""" -- Gitee From 1c720f00e15377f4161a8b6531d8bbfc0dc4443d Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 11:21:18 +0800 Subject: [PATCH 84/96] vid single int --- celescope/snp/utils/plot_vid.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/celescope/snp/utils/plot_vid.py b/celescope/snp/utils/plot_vid.py index 5211d01a..20efb76f 100644 --- a/celescope/snp/utils/plot_vid.py +++ b/celescope/snp/utils/plot_vid.py @@ -24,7 +24,7 @@ def parse_mapfile(mapfile): def read_row(row): sample = row[SAMPLE_COL_INDEX] match_dir = row[MATCH_DIR_COL_INDEX] - vid_list = [int(vid) for vid in row[VID_COL_INDEX].strip().split(',')] + vid_list = [int(vid) for vid in str(row[VID_COL_INDEX]).strip().split(',')] sample_vid_dict[sample] = vid_list sample_match_dir_dict[sample] = match_dir -- Gitee From 74ecaa0f3a33dc11dfe193ca99d738a29dd84b19 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 13:14:56 +0800 Subject: [PATCH 85/96] refactor generate docs --- docs/CHANGELOG.md | 178 ------------------- docs/CONTRIBUTING.md | 106 ----------- docs/capture_rna/analysis.md | 21 --- docs/capture_rna/count_capture_rna.md | 61 ------- docs/capture_rna/featureCounts.md | 19 -- docs/capture_rna/sample.md | 17 -- docs/capture_rna/star.md | 56 ------ docs/capture_virus/analysis_capture_virus.md | 19 -- docs/capture_virus/consensus.md | 19 -- docs/capture_virus/count_capture_virus.md | 19 -- docs/capture_virus/mkref.md | 15 -- docs/capture_virus/sample.md | 17 -- docs/capture_virus/star_virus.md | 32 ---- docs/citeseq/analysis_cite.md | 13 -- docs/citeseq/count_cite.md | 13 -- docs/citeseq/mapping_tag.md | 21 --- docs/citeseq/sample.md | 17 -- docs/dynaseq/conversion.md | 26 --- docs/dynaseq/replace_tsne.md | 30 ---- docs/dynaseq/replacement.md | 36 ---- docs/dynaseq/subsitution.md | 20 --- docs/fusion/count_fusion.md | 23 --- docs/fusion/mkref.md | 24 --- docs/fusion/sample.md | 17 -- docs/fusion/star_fusion.md | 32 ---- docs/hla/mapping_hla.md | 15 -- docs/hla/sample.md | 17 -- docs/manual.md | 38 ---- docs/manual_template.md | 23 --- docs/methods/rna.txt | 10 -- docs/mut/count_mut.md | 17 -- docs/mut/mapping_mut.md | 17 -- docs/mut/sample.md | 17 -- docs/quick_start.md | 110 ------------ docs/rna/analysis.md | 51 ------ docs/rna/mkref.md | 38 ---- docs/rna/star.md | 56 ------ docs/rna_virus/analysis_rna_virus.md | 17 -- docs/rna_virus/count.md | 27 --- docs/rna_virus/count_virus.md | 17 -- docs/rna_virus/featureCounts.md | 19 -- docs/rna_virus/sample.md | 17 -- 
docs/rna_virus/star.md | 56 ------ docs/rna_virus/star_virus.md | 32 ---- docs/snp/analysis_snp.md | 23 --- docs/snp/mkref.md | 21 --- docs/snp/variant_calling.md | 38 ---- docs/tag/analysis_tag.md | 19 -- docs/tag/count_tag.md | 44 ----- docs/tag/mapping_tag.md | 48 ----- docs/tag/split_tag.md | 26 --- docs/tcr_fl/assemble.md | 15 -- docs/tcr_fl/sample.md | 17 -- docs/tcr_fl/split_fq.md | 15 -- docs/tools/barcode.md | 61 ------- docs/tools/consensus.md | 24 --- docs/tools/count.md | 61 ------- docs/tools/cutadapt.md | 44 ----- docs/tools/featureCounts.md | 38 ---- docs/tools/sample.md | 17 -- docs/tools/target_metrics.md | 28 --- docs/vdj/count_vdj.md | 37 ---- docs/vdj/mapping_vdj.md | 35 ---- 63 files changed, 2056 deletions(-) delete mode 100755 docs/CHANGELOG.md delete mode 100644 docs/CONTRIBUTING.md delete mode 100644 docs/capture_rna/analysis.md delete mode 100644 docs/capture_rna/count_capture_rna.md delete mode 100644 docs/capture_rna/featureCounts.md delete mode 100644 docs/capture_rna/sample.md delete mode 100644 docs/capture_rna/star.md delete mode 100644 docs/capture_virus/analysis_capture_virus.md delete mode 100644 docs/capture_virus/consensus.md delete mode 100644 docs/capture_virus/count_capture_virus.md delete mode 100644 docs/capture_virus/mkref.md delete mode 100644 docs/capture_virus/sample.md delete mode 100644 docs/capture_virus/star_virus.md delete mode 100644 docs/citeseq/analysis_cite.md delete mode 100644 docs/citeseq/count_cite.md delete mode 100644 docs/citeseq/mapping_tag.md delete mode 100644 docs/citeseq/sample.md delete mode 100644 docs/dynaseq/conversion.md delete mode 100644 docs/dynaseq/replace_tsne.md delete mode 100644 docs/dynaseq/replacement.md delete mode 100644 docs/dynaseq/subsitution.md delete mode 100644 docs/fusion/count_fusion.md delete mode 100644 docs/fusion/mkref.md delete mode 100644 docs/fusion/sample.md delete mode 100644 docs/fusion/star_fusion.md delete mode 100644 docs/hla/mapping_hla.md delete mode 100644 docs/hla/sample.md delete mode 100755 docs/manual.md delete mode 100644 docs/manual_template.md delete mode 100755 docs/methods/rna.txt delete mode 100644 docs/mut/count_mut.md delete mode 100644 docs/mut/mapping_mut.md delete mode 100644 docs/mut/sample.md delete mode 100755 docs/quick_start.md delete mode 100644 docs/rna/analysis.md delete mode 100644 docs/rna/mkref.md delete mode 100644 docs/rna/star.md delete mode 100644 docs/rna_virus/analysis_rna_virus.md delete mode 100644 docs/rna_virus/count.md delete mode 100644 docs/rna_virus/count_virus.md delete mode 100644 docs/rna_virus/featureCounts.md delete mode 100644 docs/rna_virus/sample.md delete mode 100644 docs/rna_virus/star.md delete mode 100644 docs/rna_virus/star_virus.md delete mode 100644 docs/snp/analysis_snp.md delete mode 100644 docs/snp/mkref.md delete mode 100644 docs/snp/variant_calling.md delete mode 100644 docs/tag/analysis_tag.md delete mode 100644 docs/tag/count_tag.md delete mode 100644 docs/tag/mapping_tag.md delete mode 100644 docs/tag/split_tag.md delete mode 100644 docs/tcr_fl/assemble.md delete mode 100644 docs/tcr_fl/sample.md delete mode 100644 docs/tcr_fl/split_fq.md delete mode 100644 docs/tools/barcode.md delete mode 100644 docs/tools/consensus.md delete mode 100644 docs/tools/count.md delete mode 100644 docs/tools/cutadapt.md delete mode 100644 docs/tools/featureCounts.md delete mode 100644 docs/tools/sample.md delete mode 100644 docs/tools/target_metrics.md delete mode 100644 docs/vdj/count_vdj.md delete mode 100644 docs/vdj/mapping_vdj.md diff 
--git a/docs/CHANGELOG.md b/docs/CHANGELOG.md deleted file mode 100755 index 109dd6d7..00000000 --- a/docs/CHANGELOG.md +++ /dev/null @@ -1,178 +0,0 @@ -# Change Log - -## [unreleased] - 2021-06-09 -### Added - -### Changed - -### Fixed -- `celescope.tools.count` will report an error when there are multiple gtf or refFlat file under `genomeDir`. - -### Removed -- `celescope.tools.utils.glob_genomeDir` - -## [1.3.1] - 2021-06-09 -### Added - -- Add wdl workflow. - -- Add Seurat hashtag method in `celescope tag count_tag`. To get Seurat hashtag output, use `--debug`. However, there was a unsolved problem with this method: https://github.com/satijalab/seurat/issues/2549. - -### Changed - -- `{sample}_UMI_count_filtered1.tsv` in mapping_vdj changed to `{sample}_UMI_count_filtered.tsv` (remove `1` after filtered) - -### Fixed and Removed - -- Remove h5 file generation in R to avoid memory issues. - - -## [1.3.0] - 2021-05-28 - -### Added - -- `mkref` subcommand. See `celescope rna mkref`, `celescope fusion mkref` and `celescope virus mkref` for details. - -### Changed - -- Change the way to handle duplicate gene_name and gene_id in gtf file. - -Previous: - - - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. - - one gene_id with multiple gene_name: newer gene_name will overwrite older gene_name. - - duplicated (gene_name, gene_id): "_{count}" will be added to gene_name. - -Now: - - - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. - - one gene_id with multiple gene_name: error. - - duplicated (gene_name, gene_id): ignore duplicated records and print a warning. - -### Fixed - -- Fix `count tag` metrics order in merge.xls - -### Removed - -- Remove `--fusion_pos` from `celescope.fusion.count_fusion` - - -## [1.2.0] - 2021-05-19 - -### Added - -- Assay `rna` outputs .h5 file in 06.analysis directory. - -### Changed - -- Update Seurat from 2.3.4 to 4.0.1. - -- `--genomeDir` in `celescope.fusion.star_fusion` changed to `--fusion_genomeDir` to avoid misunderstanding. - -- Step `star` sort bam by samtools instead of STAR to avoid potential `not enough memory for BAM sorting` error: https://github.com/alexdobin/STAR/issues/1136 - -### Removed - -- Assay `rna` no longer outputs tab-delimited expression matrix file in 05.count directory. - - -## [1.1.9] - 2021-04-25 - -### Added - -- Add parameter `--coefficient` to `celescope tag count_tag` and `multi_tag` - - Default `0.1`. Minimum signal-to-noise ratio is calulated as `SNR_min = max(median(SNRs) * coefficient, 2)` - -- Add `.metrics.json` - -- Add `scopeV1` chemistry support. - -### Changed - -- Optimize speed and memory usage of step `barcode`(~2X faster) and `celescope.tools.count.downsample`(~15-25X faster, 1/2 memory usage). - -- Change filtering of linker from allowing two mismatches in total to two mismatches per segment; this will slightly increase the valid reads percentage. - -- Default output fastq files of `barcode` and `cutadapt` are not gzipped. Use `--gzipped` to get gzipped output. - -- Change the display of Barcode-rank plot in html report. - -### Fixed - -- Fix a bug that `celescope.tools.barcode.mismatch` cannot output all sequences correctly when n_mismatch>=2. - -- Fix an error when Numpy >= 1.2.0. - -- VDJ merge.xls can display all the metrics correctly. - -### Removed - -- Remove fastqc from `barcode` step. - - -## [1.1.8] - 2021-03-26 - -### Added - -- Add read consensus to VDJ pipeline. 
- - A consensus step was added before mapping to merge all the reads of the same - (barcode, UMI) into one UMI. For defailed consensus algorithm, refer to `celescope.tools.consensus`. - multi_vdj adds the parameter `--not_consensus` that you can skip the consensus step, and get the same results as v1.1.7. - -- Add parameter `--species` to `celescope vdj mapping_vdj` and `multi_vdj`. - - `--species` can be one of: - - `hs`: human - - `mmu`: mouse - -- Add parameter `--cell_calling_method` to `celescope rna count` and `multi_rna`. - - `--cell_calling_method` can be one of: - - `auto`: Same result as v1.1.7. - - `cellranger3`: Refer to the cell_calling algorithm of cellranger3, and the result is similar to cellranger3. - - `reflection`: Use the inflection point of the barcode-rank curve as the UMI threshold. The minimum UMI value is changed from initial threshold / 10 to initial threshold / 2 to prevent the use of a lower inflection point when there are multiple inflection points. - -- Add 4 tags to featureCounts bam. - - - `CB`: cell barcode - - `UB`: UMI - - `GN`: gene name - - `GX`: gene id - -- Add `--STAR_param` to `celescope rna STAR` - - Additional parameters of STAR can be passed into the `STAR` step. - -### Changed - -- One sample can have different chemistry fastq in mapfile. Version <= v1.1.7 will report this as an error. - -- Gtf file can be gzipped. - -- `multi_rna` can use 3 paramters: `--STAR_index`, `--gtf` and `--refFlat` instead of `--genomeDir` - -- Step `snpCalling` use mutract. - - -## [1.1.7] - 2020-12-16 - -### Added - -- Automatically detect Singleron chemistry version. - -### Changed - -- FeatureCounts use strand specificity. - -- Cutadapt default `overlap` change from `5` to `10`. - -- VDJ sort `NA` last. - -- `match clonetypes` are sorted by barcode_count(Frequency) first, then clonetype_ID. - - - - diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md deleted file mode 100644 index fb9ee986..00000000 --- a/docs/CONTRIBUTING.md +++ /dev/null @@ -1,106 +0,0 @@ -## Pull Requests -Create pull requests to `dev` branch - -## Lint code -Before pull requests, you should lint your code with the following command: -``` -pip install pylint -# lint -# W1618 (no-absolute-import) -# E1101 (no-member) -# W1633 (round-builtin) -# W1619 (old-division) -# W0105 (String statement has no effect) -# W0511 TODO! -# E1130 bad operand type for unary ~: _isnan (invalid-unary-operand-type) -# W0212 Access to a protected member _option_string_actions of a client class (protected-access) -pylint --disable=all --enable=E,W --disable=W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212 --jobs=8 celescope -``` -Your code should be rated at 10(i.e. no error or warning). - -## Write a new step -When you add a new step, you need to - - Create a step class which inherit from `celescope.tools.step.Step`. - - Create a function with the same name of the module. The main function `celescope` uses this function to run each step. - - Create a parser function with the name `get_opts_{module_name}`. `celescope` command line interface uses this function. The `sub_program` argument in this function hides all the arguments that you do not want to show in the `multi_{assay}` interface. - -For example, in `celescope.tools.cutadapt`: - -``` -from celescope.tools.step import Step, s_common -import celescope.tools.utils as utils - - -class Cutadapt(Step): - """ - Features - - Trim adapters in R2 reads with cutadapt. Default adapters includes: - - polyT=A{18}, 18 A bases. 
- - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. - - Output - - `cutadapt.log` Cutadapt output log file. - - `{sample}_clean_2.fq.gz` R2 reads file without adapters. - """ - - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) - {some init code} - - @utils.add_log - def run(self): - {some code to run} - - -@utils.add_log -def cutadapt(args): - step_name = "cutadapt" - cutadapt_obj = Cutadapt(args, step_name) - cutadapt_obj.run() - - -def get_opts_cutadapt(parser, sub_program): - parser.add_argument('--adapter_fasta', help='Addtional adapter fasta file.') - parser.add_argument( - '--minimum_length', - help='Default `20`. Discard processed reads that are shorter than LENGTH.', - default=20 - ) - {other arguments} - if sub_program: - parser.add_argument('--fq', help='Required. R2 reads from step Barcode.', required=True) - parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true') - parser = s_common(parser) - return parser -``` - -## Docs -There is a python script at the root of this repo `generate_docs.py` to generate documents for each released step. The generated docs are in the `docs` folder. It will collect: -- Docstring of the step class. The Docstring should have sections named `Features` and `Output`. -- Help infomation in `get_opts_{module_name}` - -Released assays will be added to `manual.md`. - -## Tests -If you add new steps, you need to create a small data for integration tests. There is a test example in `celescope/tests/test_multi.py`. To run this example: - - -1. Get test data -``` -# If you have access to Singleron Nanjing HPC -copy -r /SGRNJ03/randd/user/zhouyiqi/multi_tests/test_folder {test_dir} -# Or clone from repo -git clone https://github.com/singleron-RD/celescope_tests.git -``` - -2. Run `pytest` -``` -Install pytest ->>> pip install pytest -Run all ->>> pytest -s ./tests/test_multi.py --test_dir {test_dir} -Run some tests ->>> pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag -``` - -Then you need to create your own test based on this example. \ No newline at end of file diff --git a/docs/capture_rna/analysis.md b/docs/capture_rna/analysis.md deleted file mode 100644 index 61a0265f..00000000 --- a/docs/capture_rna/analysis.md +++ /dev/null @@ -1,21 +0,0 @@ - - -## Arguments -`--outdir` output dir. - -`--assay` assay. - -`--sample` sample name. - -`--thread` None - -`--debug` debug. - -`--matrix_file` matrix file. - -`--genomeDir` genomeDir. - -`--save_rds` write rds to disk. - -`--type_marker_tsv` cell type marker tsv. - diff --git a/docs/capture_rna/count_capture_rna.md b/docs/capture_rna/count_capture_rna.md deleted file mode 100644 index 102eb020..00000000 --- a/docs/capture_rna/count_capture_rna.md +++ /dev/null @@ -1,61 +0,0 @@ -## Features -- Cell-calling: Distinguish cell barcodes from background barcodes. - -- Generate expression matrix. - -## Output -- `{sample}_all_matrix` The expression matrix of all detected barcodes. - Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix_10X` The expression matrix of the barcode that is identified to be the cell. -Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix.tsv.gz` The expression matrix of the barcode that is identified to be the cell, separated by tabs. -CeleScope >=1.2.0 does not output this file. 
- -- `{sample}_count_detail.txt.gz` 4 columns: - - barcode - - gene ID - - UMI count - - read_count - -- `{sample}_counts.txt` 6 columns: - - Barcode: barcode sequence - - readcount: read count of each barcode - - UMI2: UMI count (with reads per UMI >= 2) for each barcode - - UMI: UMI count for each barcode - - geneID: gene count for each barcode - - mark: cell barcode or backgound barcode. - - `CB` cell - `UB` background - -- `{sample}_downsample.txt` 3 columns: - - percent: percentage of sampled reads - - median_geneNum: median gene number per cell - - saturation: sequencing saturation - -- `barcode_filter_magnitude.pdf` Barcode-UMI plot. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--expected_cell_num` Default `3000`. Expected cell number. - -`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` Required. BAM file from featureCounts. - -`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%. - diff --git a/docs/capture_rna/featureCounts.md b/docs/capture_rna/featureCounts.md deleted file mode 100644 index 61e7f284..00000000 --- a/docs/capture_rna/featureCounts.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--gtf_type` Specify feature type in GTF annotation. - -`--genomeDir` None - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--input` None - diff --git a/docs/capture_rna/sample.md b/docs/capture_rna/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/capture_rna/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/capture_rna/star.md b/docs/capture_rna/star.md deleted file mode 100644 index c0d71407..00000000 --- a/docs/capture_rna/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## ## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). 
-Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads. - -`--STAR_param` Other STAR parameters. - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/capture_virus/analysis_capture_virus.md b/docs/capture_virus/analysis_capture_virus.md deleted file mode 100644 index 01aeb0c1..00000000 --- a/docs/capture_virus/analysis_capture_virus.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--umi_threshold` method to find virus UMI threshold - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--match_dir` match_dir - -`--virus_file` virus UMI count file - diff --git a/docs/capture_virus/consensus.md b/docs/capture_virus/consensus.md deleted file mode 100644 index f09fdd07..00000000 --- a/docs/capture_virus/consensus.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--threshold` valid base threshold. - -`--not_consensus` input fastq is not consensus. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq` None - diff --git a/docs/capture_virus/count_capture_virus.md b/docs/capture_virus/count_capture_virus.md deleted file mode 100644 index 2789114d..00000000 --- a/docs/capture_virus/count_capture_virus.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--min_query_length` minimum query length - -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - -`--match_dir` matched rna_virus directory - -`--virus_bam` None - diff --git a/docs/capture_virus/mkref.md b/docs/capture_virus/mkref.md deleted file mode 100644 index a3627da9..00000000 --- a/docs/capture_virus/mkref.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. - -`--dry_run` Only write config file and exit. - -`--fasta` virus fasta file - -`--genomeSAindexNbases` STAR genomeSAindexNbases - diff --git a/docs/capture_virus/sample.md b/docs/capture_virus/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/capture_virus/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. 
- -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/capture_virus/star_virus.md b/docs/capture_virus/star_virus.md deleted file mode 100644 index dba66990..00000000 --- a/docs/capture_virus/star_virus.md +++ /dev/null @@ -1,32 +0,0 @@ - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads. - -`--STAR_param` Other STAR parameters. - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--virus_genomeDir` virus genome dir. - diff --git a/docs/citeseq/analysis_cite.md b/docs/citeseq/analysis_cite.md deleted file mode 100644 index df9b21da..00000000 --- a/docs/citeseq/analysis_cite.md +++ /dev/null @@ -1,13 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--match_dir` match_dir - -`--citeseq_mtx` citeseq matrix .gz file - -`--assay` assay - diff --git a/docs/citeseq/count_cite.md b/docs/citeseq/count_cite.md deleted file mode 100644 index 55cbeaf4..00000000 --- a/docs/citeseq/count_cite.md +++ /dev/null @@ -1,13 +0,0 @@ - - -## Arguments -`--match_dir` matched scRNA-Seq CeleScope directory path - -`--outdir` output dir - -`--sample` sample name - -`--assay` assay - -`--read_count_file` tag read count file - diff --git a/docs/citeseq/mapping_tag.md b/docs/citeseq/mapping_tag.md deleted file mode 100644 index 0b77fe0a..00000000 --- a/docs/citeseq/mapping_tag.md +++ /dev/null @@ -1,21 +0,0 @@ - - -## Arguments -`--fq_pattern` read2 fastq pattern. - -`--barcode_fasta` barcode fasta. - -`--linker_fasta` linker fasta. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq` clean read2. - diff --git a/docs/citeseq/sample.md b/docs/citeseq/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/citeseq/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/dynaseq/conversion.md b/docs/dynaseq/conversion.md deleted file mode 100644 index bfb0cb2a..00000000 --- a/docs/dynaseq/conversion.md +++ /dev/null @@ -1,26 +0,0 @@ -## Features -- Get conversion pos in each read. - - Get snp info. - -## Output -- `{sample}.PosTag.bam` Bam file with conversion info. -- `{sample}.PosTag.csv` SNP info in csv format. - - -## Arguments -`--strand` gene strand file - -`--bam` featureCount bam - -`--cell` barcode cell list - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. 
- diff --git a/docs/dynaseq/replace_tsne.md b/docs/dynaseq/replace_tsne.md deleted file mode 100644 index 31ed90ce..00000000 --- a/docs/dynaseq/replace_tsne.md +++ /dev/null @@ -1,30 +0,0 @@ -## Features -- Replace rate in each cluster -- Top replace genes in each cluster - -## Output -- `{sample}.rep_in_tsne.txt` Replace rate in each cluster. -- `{sample}.rep_in_tsne_top10` Top 10 replace genes in each cluster. - - -## Arguments -`--tsne` tsne file - -`--mat` matrix rep file - -`--rep` cell rep file - -`--mincell` turn-over in at least cells, default 5 - -`--topgene` top N genes,default 10 - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/dynaseq/replacement.md b/docs/dynaseq/replacement.md deleted file mode 100644 index 1184777c..00000000 --- a/docs/dynaseq/replacement.md +++ /dev/null @@ -1,36 +0,0 @@ -## Features -- Computes the replacement rates in each cell and gene. -- Boxplots for rates distribution. - -## Output -- `{sample}.TC_matrix.rds` New and old info for each barcode/gene/umi. -- `{sample}.new_matrix.tsv.gz` New RNA matrix. -- `{sample}.old_matrix.tsv.gz` Old RNA matrix. -- `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell. -- `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene. -- `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene. - - -## Arguments -`--bg_cov` background snp depth filter, lower than bg_cov will be discarded. Only valid in csv format - -`--bam` bam file - -`--bg` background snp file - -`--cell_keep` filter cell - -`--min_cell` a gene expressed in at least cells, default 10 - -`--min_gene` at least gene num in a cell, default 10 - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/dynaseq/subsitution.md b/docs/dynaseq/subsitution.md deleted file mode 100644 index e2b7b169..00000000 --- a/docs/dynaseq/subsitution.md +++ /dev/null @@ -1,20 +0,0 @@ -## Features -- Computes the overall conversion rates in reads and plots a barplot. - -## Output -- `{sample}.substitution.txt` Tab-separated table of the overall conversion rates. - - -## Arguments -`--bam` bam file - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/fusion/count_fusion.md b/docs/fusion/count_fusion.md deleted file mode 100644 index 39063118..00000000 --- a/docs/fusion/count_fusion.md +++ /dev/null @@ -1,23 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` None - -`--match_dir` match scRNA-Seq dir - -`--fusion_genomeDir` fusion genome directory - -`--flanking_base` None - -`--UMI_min` None - diff --git a/docs/fusion/mkref.md b/docs/fusion/mkref.md deleted file mode 100644 index 24ca414f..00000000 --- a/docs/fusion/mkref.md +++ /dev/null @@ -1,24 +0,0 @@ - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. 
- -`--dry_run` Only write config file and exit. - -`--fasta` fusion fasta file - -`--fusion_pos` fusion position file. A two column tab-delimited text file with header. -"pos" is the end postion of the first gene(1-based). -e.g. -tag pos -PML_3 183 -PML_4 254 -PML_5 326 -PML_6 204 - -`--genomeSAindexNbases` STAR genomeSAindexNbases - diff --git a/docs/fusion/sample.md b/docs/fusion/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/fusion/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/fusion/star_fusion.md b/docs/fusion/star_fusion.md deleted file mode 100644 index 4ac60c75..00000000 --- a/docs/fusion/star_fusion.md +++ /dev/null @@ -1,32 +0,0 @@ - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fusion_genomeDir` fusion gene STAR index genome directory - diff --git a/docs/hla/mapping_hla.md b/docs/hla/mapping_hla.md deleted file mode 100644 index d4f5f1e3..00000000 --- a/docs/hla/mapping_hla.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--fq` None - -`--assay` assay - -`--match_dir` match scRNA-Seq dir - -`--thread` number of thread - diff --git a/docs/hla/sample.md b/docs/hla/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/hla/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/manual.md b/docs/manual.md deleted file mode 100755 index e9caf430..00000000 --- a/docs/manual.md +++ /dev/null @@ -1,38 +0,0 @@ -## Introduction -CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. - -Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. - -Currently, CeleScope includes the follwing pipelines: - -- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. 
It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). - -- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. - -- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. - - -## [Quick start](quick_start.md) - -## [Change log](CHANGELOG.md) - -## Pre-processing - -- [barcode](tools/barcode.md) -- [cutadapt](tools/cutadapt.md) - -## Single-cell rna -- [mkref](rna/mkref.md) -- [star](rna/star.md) -- [featureCounts](tools/featureCounts.md) -- [count](tools/count.md) -- [analysis](rna/analysis.md) -## Single-cell vdj -- [consensus](tools/consensus.md) -- [mapping_vdj](vdj/mapping_vdj.md) -- [count_vdj](vdj/count_vdj.md) -## Single-cell tag -- [mapping_tag](tag/mapping_tag.md) -- [count_tag](tag/count_tag.md) -- [analysis_tag](tag/analysis_tag.md) -- [split_tag](tag/split_tag.md) diff --git a/docs/manual_template.md b/docs/manual_template.md deleted file mode 100644 index c524de94..00000000 --- a/docs/manual_template.md +++ /dev/null @@ -1,23 +0,0 @@ -## Introduction -CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. - -Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. - -Currently, CeleScope includes the follwing pipelines: - -- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). - -- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. - -- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. - - -## [Quick start](quick_start.md) - -## [Change log](CHANGELOG.md) - -## Pre-processing - -- [barcode](tools/barcode.md) -- [cutadapt](tools/cutadapt.md) - diff --git a/docs/methods/rna.txt b/docs/methods/rna.txt deleted file mode 100755 index 168a02a4..00000000 --- a/docs/methods/rna.txt +++ /dev/null @@ -1,10 +0,0 @@ -Single-cell transcriptomics and analysis - -Raw reads were processed to generate gene expression profiles using CeleScope v1.2.0(Singleron Biotechnologies) -with default parameters. Briefly, Barcodes and UMIs were extracted from R1 reads and corrected. Adapter sequences and poly A tails were trimmed from R2 reads and the trimmed R2 reads were aligned against the {GRCh38 (hg38)} {GRCm38 (mm10)} transcriptome -using STAR(v2.6.1b). Uniquely mapped reads were then assigned to exons with FeatureCounts(v2.0.1). 
Successfully Assigned Reads with the same cell barcode, UMI and gene were grouped together to generate the gene expression matrix for further analysis. - -Single-cell analyses were performed using the Seurat package(v4.0.1). All the variable genes selected by the FindVariableFeatures function were used to compute the PCs. The first 20 PCs and a resolution of 0.6 were used for clustering and tSNE visualization. - - - diff --git a/docs/mut/count_mut.md b/docs/mut/count_mut.md deleted file mode 100644 index 38699f30..00000000 --- a/docs/mut/count_mut.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--bam` None - -`--assay` assay - -`--mut_file` mutation file - -`--match_dir` match scRNA-Seq dir - -`--shift_base` None - diff --git a/docs/mut/mapping_mut.md b/docs/mut/mapping_mut.md deleted file mode 100644 index afccd1b6..00000000 --- a/docs/mut/mapping_mut.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--fq` None - -`--assay` assay - -`--indel_genomeDir` insertion or deletion STAR indexed genome directory - -`--thread` STAR thread - -`--outFilterMatchNmin` STAR outFilterMatchNmin - diff --git a/docs/mut/sample.md b/docs/mut/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/mut/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/quick_start.md b/docs/quick_start.md deleted file mode 100755 index 57d2327f..00000000 --- a/docs/quick_start.md +++ /dev/null @@ -1,110 +0,0 @@ -# Quick start - -CeleScope contains interfaces `multi_{assay}` to generate pipeline scripts for all assays. Assays can be one of: - -- rna -- vdj -- tag - -Run `multi_{assay} -h` for help. - - -## Usage Example - -- Single-cell rna - - ``` - conda activate celescope - multi_rna\ - --mapfile ./rna.mapfile\ - --genomeDir /SGRNJ/Public/Database/genome/homo_mus\ - --thread 8\ - --mod shell - ``` -`--mapfile` Required. Mapfile path. - -`--genomeDir` Required. Required. Genome directory. - -`--thread` The recommended setting is 8, and the maximum should not exceed 20. - -`--mod` Create `sjm`(simple job manager https://github.com/StanfordBioinformatics/SJM) or `shell` scripts. - -Scripts above will generate a `shell` directory containing `{sample}.sh` files. - -You can start your analysis by running: -``` -sh ./shell/{sample}.sh -``` - -- Single cell vdj - -``` -conda activate celescope -multi_vdj \ - --mapfile ./vdj.mapfile \ - --type TCR \ - --thread 8 \ - --mod shell -``` - -`--type` Required. TCR or BCR. - -- Single cell tag - -``` -conda activate celescope -multi_tag \ - --mapfile ./tag.mapfile\ - --barcode_fasta ./smk_barcode.fa\ - --fq_pattern L25C45\ - --mod shell -``` - -`--barcode_fasta` Required. Tag barcode fasta file. -``` ->tag_0 -GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC ->tag_1 -TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG ->tag_2 -AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA ->tag_3 -CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG -``` - -`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. - -`L` linker(common sequences) -`C` tag barcode - -## How to write mapfile - -Mapfile is a tab-delimited text file with as least three columns. 
Each line of mapfile represents paired-end fastq files. - -1st column: Fastq file prefix. -2nd column: Fastq file directory path. -3rd column: Sample name, which is the prefix of all output files. -4th column: The 4th column has different meaning for each assay. The single cell rna directory after running CeleScope is called `matched_dir`. -- `rna` Optional, forced cell number. -- `vdj` Optional, matched_dir. -- `tag` Required, matched_dir. - -### Example - -Sample1 has 2 paired-end fastq files located in 2 different directories(fastq_dir1 and fastq_dir2). Sample2 has 1 paired-end fastq file located in fastq_dir1. -``` -$cat ./my.mapfile -fastq_prefix1 fastq_dir1 sample1 -fastq_prefix2 fastq_dir2 sample1 -fastq_prefix3 fastq_dir1 sample2 - -$ls fastq_dir1 -fastq_prefix1_1.fq.gz fastq_prefix1_2.fq.gz -fastq_prefix3_1.fq.gz fastq_prefix3_2.fq.gz - -$ls fastq_dir2 -fastq_prefix2_1.fq.gz fastq_prefix2_2.fq.gz -``` - - - diff --git a/docs/rna/analysis.md b/docs/rna/analysis.md deleted file mode 100644 index 9ddfd1b3..00000000 --- a/docs/rna/analysis.md +++ /dev/null @@ -1,51 +0,0 @@ -## Features -- Cell clustering with Seurat. - -- Calculate the marker gene of each cluster. - -- Cell type annotation(optional). You can provide markers of known cell types and annotate cell types for each cluster. - -## Output -- `markers.tsv` Marker genes of each cluster. - -- `tsne_coord.tsv` t-SNE coordinates and clustering information. - -- `{sample}/06.analsis/{sample}_auto_assign/` This result will only be obtained when `--type_marker_tsv` -parameter is provided. The result contains 3 files: - - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA", -it means that the given marker is not enough to identify the cluster. - - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters. - - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1) - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--save_rds` Write rds to disk. - -`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: -``` -cell_type marker -Alveolar "CLDN18,FOLR1,AQP4,PEBP4" -Endothelial "CLDN5,FLT1,CDH5,RAMP2" -Epithelial "CAPS,TMEM190,PIFO,SNTN" -Fibroblast "COL1A1,DCN,COL1A2,C1R" -B_cell "CD79A,IGKC,IGLC3,IGHG3" -Myeloid "LYZ,MARCO,FCGR3A" -T_cell "CD3D,TRBC1,TRBC2,TRAC" -LUAD "NKX2-1,NAPSA,EPCAM" -LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" -``` - -`--matrix_file` Required. Matrix_10X directory from step count. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna/mkref.md b/docs/rna/mkref.md deleted file mode 100644 index c1b3d592..00000000 --- a/docs/rna/mkref.md +++ /dev/null @@ -1,38 +0,0 @@ -## Features -- Create a genome reference directory. - -## Output - -- STAR genome index files - -- Genome refFlat file - -- Genome config file -``` -$ cat celescope_genome.config -[genome] -genome_name = Homo_sapiens_ensembl_99 -genome_type = rna -fasta = Homo_sapiens.GRCh38.dna.primary_assembly.fa -gtf = Homo_sapiens.GRCh38.99.gtf -refflat = Homo_sapiens_ensembl_99.refFlat -``` - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. 
- -`--dry_run` Only write config file and exit. - -`--fasta` Required. Genome fasta file. Must be relative file path to genomeDir. - -`--gtf` Required. Genome gtf file. Must be relative file path to genomeDir. - -`--mt_gene_list` Mitochondria gene list file. Must be relative file path to genomeDir. -It is a plain text file with one gene per line. -If not provided, will use `MT-` and `mt-` to determine mitochondria genes. - diff --git a/docs/rna/star.md b/docs/rna/star.md deleted file mode 100644 index ec3b5211..00000000 --- a/docs/rna/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). -Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna_virus/analysis_rna_virus.md b/docs/rna_virus/analysis_rna_virus.md deleted file mode 100644 index 8893b4e1..00000000 --- a/docs/rna_virus/analysis_rna_virus.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - -`--matrix_file` matrix file - -`--virus_file` virus UMI count file - diff --git a/docs/rna_virus/count.md b/docs/rna_virus/count.md deleted file mode 100644 index 182ecfa4..00000000 --- a/docs/rna_virus/count.md +++ /dev/null @@ -1,27 +0,0 @@ -## Features -- count - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. 
- -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` None - -`--force_cell_num` force cell number. - -`--genomeDir` genome directory. - -`--gtf` gtf file path. - -`--expected_cell_num` expected cell number. - -`--cell_calling_method` cell calling methods. - diff --git a/docs/rna_virus/count_virus.md b/docs/rna_virus/count_virus.md deleted file mode 100644 index 60f09d5c..00000000 --- a/docs/rna_virus/count_virus.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--assay` assay - -`--sample` sample name - -`--thread` None - -`--debug` debug - -`--virus_bam` None - -`--barcode_file` None - diff --git a/docs/rna_virus/featureCounts.md b/docs/rna_virus/featureCounts.md deleted file mode 100644 index 61e7f284..00000000 --- a/docs/rna_virus/featureCounts.md +++ /dev/null @@ -1,19 +0,0 @@ - - -## Arguments -`--gtf_type` Specify feature type in GTF annotation. - -`--genomeDir` None - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--input` None - diff --git a/docs/rna_virus/sample.md b/docs/rna_virus/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/rna_virus/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/rna_virus/star.md b/docs/rna_virus/star.md deleted file mode 100644 index c0d71407..00000000 --- a/docs/rna_virus/star.md +++ /dev/null @@ -1,56 +0,0 @@ -## Features -- Align R2 reads to the reference genome with STAR. -- Collect Metrics with Picard. - -## ## Output -- `{sample}_Aligned.sortedByCoord.out.bam` BAM file contains Uniquely Mapped Reads. - -- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format. - -- `{sample}_Log.out` Main log with a lot of detailed information about the run. -This is most useful for troubleshooting and debugging. - -- `{sample}_Log.progress.out` Report job progress statistics, such as the number of processed reads, -% of mapped reads etc. It is updated in 1 minute intervals. - -- `{sample}_Log.Log.final.out` Summary mapping statistics after mapping job is complete, -very useful for quality control. The statistics are calculated for each read (single- or paired-end) and -then summed or averaged over all reads. Note that STAR counts a paired-end read as one read, -(unlike the samtools agstat/idxstats, which count each mate separately). -Most of the information is collected about the UNIQUE mappers -(unlike samtools agstat/idxstats which does not separate unique or multi-mappers). -Each splicing is counted in the numbers of splices, which would correspond to -summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis, -i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases. - -- `{sample}_region.log` Picard CollectRnaSeqMetrics results. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads. 
- -`--STAR_param` Other STAR parameters. - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/rna_virus/star_virus.md b/docs/rna_virus/star_virus.md deleted file mode 100644 index 7ef14bd0..00000000 --- a/docs/rna_virus/star_virus.md +++ /dev/null @@ -1,32 +0,0 @@ - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases -is higher than or equal to this value. - -`--out_unmapped` Output unmapped reads - -`--STAR_param` Other STAR parameters - -`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. - -`--starMem` Default `30`. Maximum memory that STAR can use. - -`--fq` Required. R2 fastq file. - -`--consensus_fq` Input fastq has been consensused - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--virus_genomeDir` virus genome dir - diff --git a/docs/snp/analysis_snp.md b/docs/snp/analysis_snp.md deleted file mode 100644 index fb2bd136..00000000 --- a/docs/snp/analysis_snp.md +++ /dev/null @@ -1,23 +0,0 @@ - - -## Arguments -`--annovar_config` annovar soft config file - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--match_dir` match_dir - -`--vcf` vcf file - -`--CID_file` CID_file - -`--variant_count_file` variant count file - diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md deleted file mode 100644 index b78d34f8..00000000 --- a/docs/snp/mkref.md +++ /dev/null @@ -1,21 +0,0 @@ -## Features -- Create dictionary file and fasta index for gatk SplitNCigarReads. -(https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) -Need to run `celescope rna mkref` first - -## Output -- fasta index -- gatk dictionary file - - -## Arguments -`--genomeDir` Default='./'. Output directory. - -`--thread` Default=6. Threads to use. - -`--genome_name` Required, genome name. - -`--dry_run` Only write config file and exit. - -`--fasta` fasta file - diff --git a/docs/snp/variant_calling.md b/docs/snp/variant_calling.md deleted file mode 100644 index aed2d6fa..00000000 --- a/docs/snp/variant_calling.md +++ /dev/null @@ -1,38 +0,0 @@ -## Features -- Perform variant calling. - -## Output - -`{sample}_VID.tsv` A unique numeric ID is assigned for each variant. - -`{sample}_CID.tsv` A unique numeric ID is assigned for each cell. - -`{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count. - -`{sample}_support.mtx` Support matrix, only high quality bases are considered. -0 : no reads/UMIs cover the position. -1 : all reads/UMIs at the position support the ref allele. -2 : all reads/UMIs at the position support the alt allele. -3 : one or more reads/UMIs support both the alt and the ref allele. - - -## Arguments -`--genomeDir` Genome directory after running `mkref`. - -`--vcf` VCF file. 
If vcf file is not provided, celescope will perform variant calling at single cell level -and use these variants as input vcf. - -`--bam` Input BAM file from step `target_metrics`. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tag/analysis_tag.md b/docs/tag/analysis_tag.md deleted file mode 100644 index da0f476e..00000000 --- a/docs/tag/analysis_tag.md +++ /dev/null @@ -1,19 +0,0 @@ -## Features -- Combine scRNA-Seq clustering infromation with tag assignment. - - -## Arguments -`--tsne_tag_file` `{sample}_tsne_tag.tsv` from count_tag. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tag/count_tag.md b/docs/tag/count_tag.md deleted file mode 100644 index 815b3eb1..00000000 --- a/docs/tag/count_tag.md +++ /dev/null @@ -1,44 +0,0 @@ -## Features -- Assign tag to each cell barcode and summarize. - -## Output - -- `{sample}_umi_tag.tsv` - - `first column` cell barcode - `last column` assigned tag - `columns between first and last` UMI count for each tag - -- `{sample}_tsne_tag.tsv` it is `{sample}_umi_tag.tsv` with t-SNE coordinates, gene_counts and cluster infomation - -- `{sample}_cluster_count.tsv` cell barcode number assigned to *undeterminded*, *multiplet* and *each tag* - - -## Arguments -`--UMI_min` Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*. - -`--dim` Default=1. Tag dimentions. Usually we use 1-dimentional tag. - -`--SNR_min` Default='auto'. Minimum signal-to-noise ratio. -Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. - -`--combine_cluster` Conbine cluster tsv file. - -`--coefficient` Default=0.1. If `SNR_min` is 'auto', minimum signal-to-noise ratio is calulated as -`SNR_min = max(median(SNRs) * coefficient, 2)`. -Smaller `coefficient` will cause less *multiplet* in the tag assignment. - -`--read_count_file` Tag read count file. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tag/mapping_tag.md b/docs/tag/mapping_tag.md deleted file mode 100644 index 1f7ab1ae..00000000 --- a/docs/tag/mapping_tag.md +++ /dev/null @@ -1,48 +0,0 @@ -## Features -- Align R2 reads to the tag barcode fasta. - -## Output - -- `{sample}_read_count.tsv` tab-delimited text file with 4 columns. - - `barcode` cell barcode - `tag_name` tag name in barcode_fasta - `UMI` UMI sequence - `read_count` read count per UMI - - -## Arguments -`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. -`L` linker(common sequences) -`C` tag barcode - -`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode -sequence in R2 reads with all tag barcode sequence in barcode_fasta. -It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. -If no such tag exists, the read is classified as invalid. 
-``` ->tag_0 -GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC ->tag_1 -TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG ->tag_2 -AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA ->tag_3 -CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG -``` - -`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads -with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq` R2 read fastq. - diff --git a/docs/tag/split_tag.md b/docs/tag/split_tag.md deleted file mode 100644 index 5a43f7f8..00000000 --- a/docs/tag/split_tag.md +++ /dev/null @@ -1,26 +0,0 @@ -## Features -- Split scRNA-Seq fastq according to tag assignment. - -## Output -- `fastq/{tag}_{1,2}.fq` Fastq files of each tag. - - -## Arguments -`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment. - -`--umi_tag_file` UMI tag file. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--R1_read` R1 read path. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tcr_fl/assemble.md b/docs/tcr_fl/assemble.md deleted file mode 100644 index 95662af1..00000000 --- a/docs/tcr_fl/assemble.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fastq_dir` None - diff --git a/docs/tcr_fl/sample.md b/docs/tcr_fl/sample.md deleted file mode 100644 index 87ee3cfe..00000000 --- a/docs/tcr_fl/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file. - -`--chemistry` chemistry version. - diff --git a/docs/tcr_fl/split_fq.md b/docs/tcr_fl/split_fq.md deleted file mode 100644 index fe767158..00000000 --- a/docs/tcr_fl/split_fq.md +++ /dev/null @@ -1,15 +0,0 @@ - - -## Arguments -`--outdir` output dir - -`--sample` sample name - -`--fq` None - -`--assay` assay - -`--match_dir` match scRNA-Seq dir - -`--nCell` select top N cell - diff --git a/docs/tools/barcode.md b/docs/tools/barcode.md deleted file mode 100644 index 9f31bd94..00000000 --- a/docs/tools/barcode.md +++ /dev/null @@ -1,61 +0,0 @@ -## Features - -- Demultiplex barcodes. -- Filter invalid R1 reads, which includes: - - Reads without linker: the mismatch between linkers and all linkers in the whitelist is greater than 2. - - Reads without correct barcode: the mismatch between barcodes and all barcodes in the whitelist is greater than 1. - - Reads without polyT: the number of T bases in the defined polyT region is less than 10. - - Low quality reads: low sequencing quality in barcode and UMI regions. - -## Output - -- `01.barcode/{sample}_2.fq(.gz)` Demultiplexed R2 reads. Barcode and UMI are contained in the read name. The format of -the read name is `{barcode}_{UMI}_{read ID}`. - - -## Arguments -`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. 
Can be one of: -- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. -- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. -- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the -same time. - -`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number - of bases. -- `C`: cell barcode -- `L`: linker(common sequences) -- `U`: UMI -- `T`: poly T - -`--whitelist` Cell barcode whitelist file path, one cell barcode per line. - -`--linker` Linker whitelist file path, one linker per line. - -`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. - -`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. - -`--nopolyT` Outputs R1 reads without polyT. - -`--noLinker` Outputs R1 reads without correct linker. - -`--allowNoPolyT` Allow valid reads without polyT. - -`--allowNoLinker` Allow valid reads without correct linker. - -`--gzip` Output gzipped fastq files. - -`--fq1` R1 fastq file. Multiple files are separated by comma. - -`--fq2` R2 fastq file. Multiple files are separated by comma. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/consensus.md b/docs/tools/consensus.md deleted file mode 100644 index 77e11286..00000000 --- a/docs/tools/consensus.md +++ /dev/null @@ -1,24 +0,0 @@ -## Features -- Consensus all the reads of the same (barcode, UMI) combinations into one read(UMI). - -## Output -- `{sample}_consensus.fq` Consensus fastq. - - -## Arguments -`--threshold` Default 0.5. Valid base threshold. - -`--not_consensus` Skip the consensus step. - -`--fq` Required. Fastq file. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/count.md b/docs/tools/count.md deleted file mode 100644 index 102eb020..00000000 --- a/docs/tools/count.md +++ /dev/null @@ -1,61 +0,0 @@ -## Features -- Cell-calling: Distinguish cell barcodes from background barcodes. - -- Generate expression matrix. - -## Output -- `{sample}_all_matrix` The expression matrix of all detected barcodes. - Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix_10X` The expression matrix of the barcode that is identified to be the cell. -Can be read in by calling the `Seurat::Read10X` function. - -- `{sample}_matrix.tsv.gz` The expression matrix of the barcode that is identified to be the cell, separated by tabs. -CeleScope >=1.2.0 does not output this file. - -- `{sample}_count_detail.txt.gz` 4 columns: - - barcode - - gene ID - - UMI count - - read_count - -- `{sample}_counts.txt` 6 columns: - - Barcode: barcode sequence - - readcount: read count of each barcode - - UMI2: UMI count (with reads per UMI >= 2) for each barcode - - UMI: UMI count for each barcode - - geneID: gene count for each barcode - - mark: cell barcode or backgound barcode. 
- - `CB` cell - `UB` background - -- `{sample}_downsample.txt` 3 columns: - - percent: percentage of sampled reads - - median_geneNum: median gene number per cell - - saturation: sequencing saturation - -- `barcode_filter_magnitude.pdf` Barcode-UMI plot. - - -## Arguments -`--genomeDir` Required. Genome directory. - -`--expected_cell_num` Default `3000`. Expected cell number. - -`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--bam` Required. BAM file from featureCounts. - -`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%. - diff --git a/docs/tools/cutadapt.md b/docs/tools/cutadapt.md deleted file mode 100644 index e75d6e72..00000000 --- a/docs/tools/cutadapt.md +++ /dev/null @@ -1,44 +0,0 @@ -## Features -- Trim adapters in R2 reads with cutadapt. Default adapters includes: - - polyT=A{18}, 18 A bases. - - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. - -## Output -- `cutadapt.log` Cutadapt output log file. -- `{sample}_clean_2.fq.gz` R2 reads file without adapters. - - -## Arguments -`--adapter_fasta` Addtional adapter fasta file. - -`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. - -`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). -Some Illumina instruments use a two-color chemistry to encode the four bases. -This includes the NextSeq and the NovaSeq. -In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. -However, dark cycles also occur when sequencing “falls off” the end of the fragment. -The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. - -`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, -short matches can occur by chance, leading to erroneously trimmed bases. -For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. -To reduce the number of falsely trimmed bases, the alignment algorithm requires that -at least {overlap} bases match between adapter and read. - -`--insert` Default `150`. Read2 insert length. - -`--fq` Required. R2 reads from step Barcode. - -`--gzip` Output gzipped fastq - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/featureCounts.md b/docs/tools/featureCounts.md deleted file mode 100644 index 3822ca55..00000000 --- a/docs/tools/featureCounts.md +++ /dev/null @@ -1,38 +0,0 @@ -## Features - -- Assigning uniquely mapped reads to genomic features with FeatureCounts. - -## Output -- `{sample}` Numbers of reads assigned to features (or meta-features). - -- `{sample}_summary` Stat info for the overall summrization results, including number of -successfully assigned reads and number of reads that failed to be assigned due to -various reasons (these reasons are included in the stat info). 
- -- `{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam` featureCounts output BAM, -sorted by coordinates;BAM file contains tags as following(Software Version>=1.1.8): - - CB cell barcode - - UB UMI - - GN gene name - - GX gene id - -- `{sample}_name_sorted.bam` featureCounts output BAM, sorted by read name. - - -## Arguments -`--gtf_type` Specify feature type in GTF annotation - -`--genomeDir` Required. Genome directory. - -`--input` Required. BAM file path. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/tools/sample.md b/docs/tools/sample.md deleted file mode 100644 index e6fb6ce3..00000000 --- a/docs/tools/sample.md +++ /dev/null @@ -1,17 +0,0 @@ - - -## Arguments -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -`--fq1` read1 fq file - -`--chemistry` chemistry version - diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md deleted file mode 100644 index d2fbaa04..00000000 --- a/docs/tools/target_metrics.md +++ /dev/null @@ -1,28 +0,0 @@ -## Features -- Filter bam file - - Filter reads that are not cell-associated. - - Filter reads that are not mapped to target genes. - -- Collect enrichment metrics. - -## Output -- `filtered.bam` BAM file after filtering. - - -## Arguments -`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported. - -`--bam` Input bam file - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - diff --git a/docs/vdj/count_vdj.md b/docs/vdj/count_vdj.md deleted file mode 100644 index bd10f86d..00000000 --- a/docs/vdj/count_vdj.md +++ /dev/null @@ -1,37 +0,0 @@ -## Features -- Cell-calling based on barcode-UMI rank. -- Summarize clonetypes infomation. - -## Output -- `{sample}_cell_confident.tsv` The clone type of VDJ cell barcode, each chain occupies one line. - -- `{sample}_cell_confident_count.tsv` The clone type of VDJ cell barcode, each cell occupies one line. - -- `{sample}_clonetypes.tsv` The count and percentage of each clonetypes of VDJ cell barcode. - -- `{sample}_match_clonetypes.tsv` When summarize clonetypes, only consider barcodes in the match scRNA-Seq library. -This file will only be produced when the `match_dir` parameter is provided. - - -## Arguments -`--type` Required. `TCR` or `BCR`. - -`--UMI_min` Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell. - -`--iUMI` Default `1`. Minimum number of UMI of identical receptor type and CDR3. -For each (barcode, chain) combination, only UMI>=iUMI is considered valid. - -`--UMI_count_filter_file` Required. File from step mapping_vdj. - -`--match_dir` Match celescope scRNA-Seq directory. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. 
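
The `UMI_min`/`iUMI` filtering described in count_vdj above is straightforward to prototype. A minimal sketch in Python, assuming a tab-delimited table with `barcode`, `chain` and `UMI` columns (the real `{sample}_UMI_count_filtered.tsv` layout may differ):

```python
import pandas as pd

# Hypothetical file name and column names, for illustration only.
df = pd.read_csv("sample_UMI_count_filtered.tsv", sep="\t")

iUMI = 1      # minimum UMIs per (barcode, chain) record
UMI_min = 10  # a fixed threshold instead of 'auto'

# Keep only (barcode, chain) records supported by >= iUMI UMIs.
valid = df[df["UMI"] >= iUMI]

# Call a barcode as a VDJ cell when its summed UMI count reaches UMI_min.
umi_per_barcode = valid.groupby("barcode")["UMI"].sum()
cell_barcodes = umi_per_barcode[umi_per_barcode >= UMI_min].index.tolist()
```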
- diff --git a/docs/vdj/mapping_vdj.md b/docs/vdj/mapping_vdj.md deleted file mode 100644 index 25bb304a..00000000 --- a/docs/vdj/mapping_vdj.md +++ /dev/null @@ -1,35 +0,0 @@ -## Features -- Align R2 reads to IGMT(http://www.imgt.org/) database sequences with mixcr. - -## Output -- `{sample}_consensus.fasta` Fasta file after UMI consensus. - -- `{sample}_UMI_count_unfiltered.tsv` UMI reading for each (barcode, chain, VJ_pair) combination. - -- `{sample}_UMI_count_filtered.tsv` For each (barcode, chain) combination, only the record with the -most VJ_pair UMI reads is kept. - -- `{sample}_align.txt` Result report. - -- `{sample}_alignments.txt` The alignment result of each UMI/read. - - -## Arguments -`--type` TCR or BCR - -`--species` Default `hs`. `hs`(human) or `mmu`(mouse). - -`--not_consensus` Input fastq is not consensused. - -`--fq` Required. Input fastq file. - -`--outdir` Output diretory. - -`--assay` Assay name. - -`--sample` Sample name. - -`--thread` Thread to use. - -`--debug` If this argument is used, celescope may output addtional file for debugging. - -- Gitee From 0681e783c0ebe8fbc702c28e23cb4e1dcf76448c Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 13:15:13 +0800 Subject: [PATCH 86/96] refactor --- celescope/snp/mkref.py | 4 +- celescope/tools/multi.py | 41 +++---- celescope/tools/utils.py | 106 ------------------ docs_template/CHANGELOG.md | 178 +++++++++++++++++++++++++++++++ docs_template/CONTRIBUTING.md | 106 ++++++++++++++++++ docs_template/manual_template.md | 23 ++++ docs_template/quick_start.md | 110 +++++++++++++++++++ generate_docs.py | 90 ++++++++++------ methods/rna.txt | 10 ++ 9 files changed, 508 insertions(+), 160 deletions(-) create mode 100755 docs_template/CHANGELOG.md create mode 100644 docs_template/CONTRIBUTING.md create mode 100644 docs_template/manual_template.md create mode 100755 docs_template/quick_start.md create mode 100755 methods/rna.txt diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py index 32fd2dd0..e8659428 100644 --- a/celescope/snp/mkref.py +++ b/celescope/snp/mkref.py @@ -21,8 +21,8 @@ class Mkref_snp(Mkref): Usage ``` # run celescope rna mkref first - celescope snp mkref \ - --genome_name Homo_sapiens_ensembl_99 \ + celescope snp mkref \\ + --genome_name Homo_sapiens_ensembl_99 \\ --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa ``` """ diff --git a/celescope/tools/multi.py b/celescope/tools/multi.py index 4df0d7e8..75b3ea7e 100755 --- a/celescope/tools/multi.py +++ b/celescope/tools/multi.py @@ -20,8 +20,6 @@ class Multi(): self.__STEPS__ = init_module.__STEPS__ self.__CONDA__ = os.path.basename(os.environ['CONDA_DEFAULT_ENV']) self.__APP__ = 'celescope' - self.col4_default = None - self.last_step = '' self.steps_not_run = ['mkref'] # remove @@ -29,31 +27,26 @@ class Multi(): if step in self.__STEPS__: self.__STEPS__.remove(step) - # parse_args + # add args + self.parser = None self.common_args() self.step_args() - self.args = self.parser.parse_args() - if self.args.gzip: - self.fq_suffix = ".gz" - else: - self.fq_suffix = "" - if self.args.steps_run == 'all': - self.steps_run = self.__STEPS__ - elif self.args.steps_run: - self.steps_run = self.args.steps_run.strip().split(',') - # init + # set + self.args = None + self.col4_default = None + self.last_step = '' + self.fq_suffix = "" + self.steps_run = self.__STEPS__ self.fq_dict = {} self.col4_dict = {} self.col5_dict = {} - self.logdir = self.args.outdir + '/log' + self.logdir = None - # script init - self.sjm_cmd = f'log_dir {self.logdir}\n' + 
self.sjm_cmd = '' self.sjm_order = '' self.shell_dict = defaultdict(str) - # outdir dict self.outdir_dic = {} @@ -90,7 +83,7 @@ class Multi(): @staticmethod @utils.add_log - def parse_map_col4(mapfile, default_val): + def parse_mapfile(mapfile, default_val): fq_dict = defaultdict(list) col4_dict = {} col5_dict = {} @@ -137,8 +130,18 @@ class Multi(): """ parse_mapfile, make log dir, init script variables, init outdir_dic """ + self.args = self.parser.parse_args() + + if self.args.gzip: + self.fq_suffix = ".gz" + if self.args.steps_run != 'all': + self.steps_run = self.args.steps_run.strip().split(',') + + self.logdir = self.args.outdir + '/log' + self.sjm_cmd = f'log_dir {self.logdir}\n' + # parse_mapfile - self.fq_dict, self.col4_dict, self.col5_dict = self.parse_map_col4(self.args.mapfile, self.col4_default) + self.fq_dict, self.col4_dict, self.col5_dict = self.parse_mapfile(self.args.mapfile, self.col4_default) # mk log dir if self.args.mod == 'sjm': diff --git a/celescope/tools/utils.py b/celescope/tools/utils.py index 36e217e5..65c0f4ad 100755 --- a/celescope/tools/utils.py +++ b/celescope/tools/utils.py @@ -419,80 +419,6 @@ def get_fq(library_id, library_path): return fq1, fq2 -@add_log -def parse_map_col4(mapfile, default_val): - fq_dict = defaultdict(list) - col4_dict = defaultdict(list) - col5_dict = defaultdict(list) - with open(mapfile) as fh: - for line in fh: - line = line.strip() - if not line: - continue - if line.startswith('#'): - continue - tmp = line.split() - library_id = tmp[0] - library_path = tmp[1] - sample_name = tmp[2] - if len(tmp) >= 4: - col4 = tmp[3] - else: - col4 = default_val - fq1, fq2 = get_fq(library_id, library_path) - - if sample_name in fq_dict: - fq_dict[sample_name][0].append(fq1) - fq_dict[sample_name][1].append(fq2) - else: - fq_dict[sample_name] = [[fq1], [fq2]] - if col4 and col4 != default_val: - col4_dict[sample_name] = col4 - if len(tmp) == 5: - col5_dict[sample_name] = tmp[4] - - for sample_name in fq_dict: - fq_dict[sample_name][0] = ",".join(fq_dict[sample_name][0]) - fq_dict[sample_name][1] = ",".join(fq_dict[sample_name][1]) - - if not fq_dict: - raise Exception('empty mapfile!') - return fq_dict, col4_dict, col5_dict - - -def generate_sjm(cmd, name, conda, m=1, x=1): - res_cmd = f''' -job_begin - name {name} - sched_options -w n -cwd -V -l vf={m}g,p={x} - cmd source activate {conda}; {cmd} -job_end -''' - - return res_cmd - - -def merge_report( - fq_dict, steps, last_step, sjm_cmd, - sjm_order, logdir, conda, outdir, rm_files): - step = "merge_report" - steps_str = ",".join(steps) - samples = ','.join(fq_dict.keys()) - app = tools_dir + '/merge_table.py' - cmd = ( - f'python {app} --samples {samples} ' - f'--steps {steps_str} --outdir {outdir}' - ) - if rm_files: - cmd += ' --rm_files' - sjm_cmd += generate_sjm(cmd, 'merge_report', conda) - for sample in fq_dict: - sjm_order += f'order {step} after {last_step}_{sample}\n' - with open(logdir + '/sjm.job', 'w') as fh: - fh.write(sjm_cmd + '\n') - fh.write(sjm_order) - - def format_number(number: int) -> str: return format(number, ",") @@ -533,38 +459,6 @@ def genDict(dim=3, valType=int): return defaultdict(lambda: genDict(dim - 1, valType=valType)) -def cluster_tsne_list(tsne_df): - """ - tSNE_1 tSNE_2 cluster Gene_Counts - return data list - """ - sum_df = tsne_df.groupby(["cluster"]).agg("count").iloc[:, 0] - percent_df = sum_df.transform(lambda x: round(x / sum(x) * 100, 2)) - res = [] - for cluster in sorted(tsne_df.cluster.unique()): - sub_df = tsne_df[tsne_df.cluster == cluster] - 
name = "cluster {cluster}({percent}%)".format( - cluster=cluster, percent=percent_df[cluster]) - tSNE_1 = list(sub_df.tSNE_1) - tSNE_2 = list(sub_df.tSNE_2) - res.append({"name": name, "tSNE_1": tSNE_1, "tSNE_2": tSNE_2}) - return res - - -def marker_table(marker_df): - """ - return html code - """ - marker_df = marker_df.loc[:, ["cluster", "gene", - "avg_log2FC", "pct.1", "pct.2", "p_val_adj"]] - marker_gene_table = marker_df.to_html( - escape=False, - index=False, - table_id="marker_gene_table", - justify="center") - return marker_gene_table - - def report_prepare(outdir, **kwargs): json_file = outdir + '/../.data.json' if not os.path.exists(json_file): diff --git a/docs_template/CHANGELOG.md b/docs_template/CHANGELOG.md new file mode 100755 index 00000000..109dd6d7 --- /dev/null +++ b/docs_template/CHANGELOG.md @@ -0,0 +1,178 @@ +# Change Log + +## [unreleased] - 2021-06-09 +### Added + +### Changed + +### Fixed +- `celescope.tools.count` will report an error when there are multiple gtf or refFlat file under `genomeDir`. + +### Removed +- `celescope.tools.utils.glob_genomeDir` + +## [1.3.1] - 2021-06-09 +### Added + +- Add wdl workflow. + +- Add Seurat hashtag method in `celescope tag count_tag`. To get Seurat hashtag output, use `--debug`. However, there was a unsolved problem with this method: https://github.com/satijalab/seurat/issues/2549. + +### Changed + +- `{sample}_UMI_count_filtered1.tsv` in mapping_vdj changed to `{sample}_UMI_count_filtered.tsv` (remove `1` after filtered) + +### Fixed and Removed + +- Remove h5 file generation in R to avoid memory issues. + + +## [1.3.0] - 2021-05-28 + +### Added + +- `mkref` subcommand. See `celescope rna mkref`, `celescope fusion mkref` and `celescope virus mkref` for details. + +### Changed + +- Change the way to handle duplicate gene_name and gene_id in gtf file. + +Previous: + + - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. + - one gene_id with multiple gene_name: newer gene_name will overwrite older gene_name. + - duplicated (gene_name, gene_id): "_{count}" will be added to gene_name. + +Now: + + - one gene_name with multiple gene_id: "_{count}" will be added to gene_name. + - one gene_id with multiple gene_name: error. + - duplicated (gene_name, gene_id): ignore duplicated records and print a warning. + +### Fixed + +- Fix `count tag` metrics order in merge.xls + +### Removed + +- Remove `--fusion_pos` from `celescope.fusion.count_fusion` + + +## [1.2.0] - 2021-05-19 + +### Added + +- Assay `rna` outputs .h5 file in 06.analysis directory. + +### Changed + +- Update Seurat from 2.3.4 to 4.0.1. + +- `--genomeDir` in `celescope.fusion.star_fusion` changed to `--fusion_genomeDir` to avoid misunderstanding. + +- Step `star` sort bam by samtools instead of STAR to avoid potential `not enough memory for BAM sorting` error: https://github.com/alexdobin/STAR/issues/1136 + +### Removed + +- Assay `rna` no longer outputs tab-delimited expression matrix file in 05.count directory. + + +## [1.1.9] - 2021-04-25 + +### Added + +- Add parameter `--coefficient` to `celescope tag count_tag` and `multi_tag` + + Default `0.1`. Minimum signal-to-noise ratio is calulated as `SNR_min = max(median(SNRs) * coefficient, 2)` + +- Add `.metrics.json` + +- Add `scopeV1` chemistry support. + +### Changed + +- Optimize speed and memory usage of step `barcode`(~2X faster) and `celescope.tools.count.downsample`(~15-25X faster, 1/2 memory usage). 
+
+- Change filtering of linker from allowing two mismatches in total to two mismatches per segment; this will slightly increase the valid reads percentage.
+
+- Default output fastq files of `barcode` and `cutadapt` are not gzipped. Use `--gzipped` to get gzipped output.
+
+- Change the display of the Barcode-rank plot in the html report.
+
+### Fixed
+
+- Fix a bug where `celescope.tools.barcode.mismatch` could not output all sequences correctly when n_mismatch>=2.
+
+- Fix an error when Numpy >= 1.2.0.
+
+- VDJ merge.xls can display all the metrics correctly.
+
+### Removed
+
+- Remove fastqc from the `barcode` step.
+
+
+## [1.1.8] - 2021-03-26
+
+### Added
+
+- Add read consensus to the VDJ pipeline.
+
+  A consensus step was added before mapping to merge all the reads of the same
+  (barcode, UMI) into one UMI. For the detailed consensus algorithm, refer to `celescope.tools.consensus`.
+  multi_vdj adds the parameter `--not_consensus`, which skips the consensus step and gives the same results as v1.1.7.
+
+- Add parameter `--species` to `celescope vdj mapping_vdj` and `multi_vdj`.
+
+  `--species` can be one of:
+  - `hs`: human
+  - `mmu`: mouse
+
+- Add parameter `--cell_calling_method` to `celescope rna count` and `multi_rna`.
+
+  `--cell_calling_method` can be one of:
+  - `auto`: Same result as v1.1.7.
+  - `cellranger3`: Refer to the cell calling algorithm of cellranger3; the result is similar to cellranger3.
+  - `inflection`: Use the inflection point of the barcode-rank curve as the UMI threshold. The minimum UMI value is changed from initial threshold / 10 to initial threshold / 2 to prevent the use of a lower inflection point when there are multiple inflection points.
+
+- Add 4 tags to the featureCounts bam.
+
+  - `CB`: cell barcode
+  - `UB`: UMI
+  - `GN`: gene name
+  - `GX`: gene id
+
+- Add `--STAR_param` to `celescope rna STAR`.
+
+  Additional parameters of STAR can be passed into the `STAR` step.
+
+### Changed
+
+- One sample can have fastq files of different chemistry in the mapfile. Version <= v1.1.7 reports this as an error.
+
+- The gtf file can be gzipped.
+
+- `multi_rna` can use 3 parameters: `--STAR_index`, `--gtf` and `--refFlat` instead of `--genomeDir`.
+
+- Step `snpCalling` uses mutract.
+
+
+## [1.1.7] - 2020-12-16
+
+### Added
+
+- Automatically detect Singleron chemistry version.
+
+### Changed
+
+- FeatureCounts uses strand specificity.
+
+- Cutadapt default `overlap` changed from `5` to `10`.
+
+- VDJ sorts `NA` last.
+
+- `match clonetypes` are sorted by barcode_count (Frequency) first, then clonetype_ID.
+
+
+
+
diff --git a/docs_template/CONTRIBUTING.md b/docs_template/CONTRIBUTING.md
new file mode 100644
index 00000000..fb9ee986
--- /dev/null
+++ b/docs_template/CONTRIBUTING.md
@@ -0,0 +1,106 @@
+## Pull Requests
+Create pull requests to the `dev` branch.
+
+## Lint code
+Before opening a pull request, lint your code with the following command:
+```
+pip install pylint
+# lint
+# W1618 (no-absolute-import)
+# E1101 (no-member)
+# W1633 (round-builtin)
+# W1619 (old-division)
+# W0105 (String statement has no effect)
+# W0511 TODO!
+# E1130 bad operand type for unary ~: _isnan (invalid-unary-operand-type)
+# W0212 Access to a protected member _option_string_actions of a client class (protected-access)
+pylint --disable=all --enable=E,W --disable=W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212 --jobs=8 celescope
+```
+Your code should be rated at 10 (i.e. no errors or warnings).
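
The lint gate above can also be invoked from Python, e.g. in a CI script. A minimal sketch using pylint's programmatic entry point, with the flags copied from the command above (note that `Run` exits the interpreter with pylint's status code by default, which suits CI jobs):

```python
# Sketch only: run the project's lint configuration from Python.
from pylint.lint import Run

DISABLED = "W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212"

Run([
    "--disable=all",
    "--enable=E,W",
    f"--disable={DISABLED}",
    "--jobs=8",
    "celescope",
])
```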
+ +## Write a new step +When you add a new step, you need to + - Create a step class which inherit from `celescope.tools.step.Step`. + - Create a function with the same name of the module. The main function `celescope` uses this function to run each step. + - Create a parser function with the name `get_opts_{module_name}`. `celescope` command line interface uses this function. The `sub_program` argument in this function hides all the arguments that you do not want to show in the `multi_{assay}` interface. + +For example, in `celescope.tools.cutadapt`: + +``` +from celescope.tools.step import Step, s_common +import celescope.tools.utils as utils + + +class Cutadapt(Step): + """ + Features + - Trim adapters in R2 reads with cutadapt. Default adapters includes: + - polyT=A{18}, 18 A bases. + - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter. + + Output + - `cutadapt.log` Cutadapt output log file. + - `{sample}_clean_2.fq.gz` R2 reads file without adapters. + """ + + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + {some init code} + + @utils.add_log + def run(self): + {some code to run} + + +@utils.add_log +def cutadapt(args): + step_name = "cutadapt" + cutadapt_obj = Cutadapt(args, step_name) + cutadapt_obj.run() + + +def get_opts_cutadapt(parser, sub_program): + parser.add_argument('--adapter_fasta', help='Addtional adapter fasta file.') + parser.add_argument( + '--minimum_length', + help='Default `20`. Discard processed reads that are shorter than LENGTH.', + default=20 + ) + {other arguments} + if sub_program: + parser.add_argument('--fq', help='Required. R2 reads from step Barcode.', required=True) + parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true') + parser = s_common(parser) + return parser +``` + +## Docs +There is a python script at the root of this repo `generate_docs.py` to generate documents for each released step. The generated docs are in the `docs` folder. It will collect: +- Docstring of the step class. The Docstring should have sections named `Features` and `Output`. +- Help infomation in `get_opts_{module_name}` + +Released assays will be added to `manual.md`. + +## Tests +If you add new steps, you need to create a small data for integration tests. There is a test example in `celescope/tests/test_multi.py`. To run this example: + + +1. Get test data +``` +# If you have access to Singleron Nanjing HPC +copy -r /SGRNJ03/randd/user/zhouyiqi/multi_tests/test_folder {test_dir} +# Or clone from repo +git clone https://github.com/singleron-RD/celescope_tests.git +``` + +2. Run `pytest` +``` +Install pytest +>>> pip install pytest +Run all +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} +Run some tests +>>> pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag +``` + +Then you need to create your own test based on this example. \ No newline at end of file diff --git a/docs_template/manual_template.md b/docs_template/manual_template.md new file mode 100644 index 00000000..c524de94 --- /dev/null +++ b/docs_template/manual_template.md @@ -0,0 +1,23 @@ +## Introduction +CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. 
+
+Each pipeline consists of several steps, and all pipelines share two identical pre-processing steps: `barcode` and `cutadapt`. The `barcode` step performs barcode demultiplexing, correction and read filtering. The `cutadapt` step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming.
+
+Currently, CeleScope includes the following pipelines:
+
+- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment (optional).
+
+- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetype counting.
+
+- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplet identification.
+
+
+## [Quick start](quick_start.md)
+
+## [Change log](CHANGELOG.md)
+
+## Pre-processing
+
+- [barcode](tools/barcode.md)
+- [cutadapt](tools/cutadapt.md)
+
diff --git a/docs_template/quick_start.md b/docs_template/quick_start.md
new file mode 100755
index 00000000..57d2327f
--- /dev/null
+++ b/docs_template/quick_start.md
@@ -0,0 +1,110 @@
+# Quick start
+
+CeleScope contains interfaces `multi_{assay}` to generate pipeline scripts for all assays. Assays can be one of:
+
+- rna
+- vdj
+- tag
+
+Run `multi_{assay} -h` for help.
+
+
+## Usage Example
+
+- Single-cell rna
+
+    ```
+    conda activate celescope
+    multi_rna\
+    --mapfile ./rna.mapfile\
+    --genomeDir /SGRNJ/Public/Database/genome/homo_mus\
+    --thread 8\
+    --mod shell
+    ```
+`--mapfile` Required. Mapfile path.
+
+`--genomeDir` Required. Genome directory.
+
+`--thread` The recommended setting is 8, and the maximum should not exceed 20.
+
+`--mod` Create `sjm` (simple job manager, https://github.com/StanfordBioinformatics/SJM) or `shell` scripts.
+
+The scripts above will generate a `shell` directory containing `{sample}.sh` files.
+
+You can start your analysis by running:
+```
+sh ./shell/{sample}.sh
+```
+
+- Single cell vdj
+
+```
+conda activate celescope
+multi_vdj \
+    --mapfile ./vdj.mapfile \
+    --type TCR \
+    --thread 8 \
+    --mod shell
+```
+
+`--type` Required. TCR or BCR.
+
+- Single cell tag
+
+```
+conda activate celescope
+multi_tag \
+    --mapfile ./tag.mapfile\
+    --barcode_fasta ./smk_barcode.fa\
+    --fq_pattern L25C45\
+    --mod shell
+```
+
+`--barcode_fasta` Required. Tag barcode fasta file.
+```
+>tag_0
+GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC
+>tag_1
+TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG
+>tag_2
+AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA
+>tag_3
+CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG
+```
+
+`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases.
+
+`L` linker (common sequences)
+`C` tag barcode
+
+## How to write mapfile
+
+The mapfile is a tab-delimited text file with at least three columns. Each line of the mapfile describes one pair of paired-end fastq files.
+
+1st column: Fastq file prefix.
+2nd column: Fastq file directory path.
+3rd column: Sample name, which is the prefix of all output files.
+4th column: The 4th column has a different meaning for each assay. The single-cell rna directory after running CeleScope is called `matched_dir`.
+- `rna` Optional, forced cell number.
+- `vdj` Optional, matched_dir.
+- `tag` Required, matched_dir.
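
As a sketch of the layout just described, the mapfile can be read with a few lines of Python (the real parser, `Multi.parse_mapfile` in `celescope/tools/multi.py`, handles more cases, such as merging multiple fastq files per sample):

```python
import csv
from collections import defaultdict

def read_mapfile(path):
    """Minimal mapfile reader, for illustration only."""
    fq = defaultdict(list)   # sample -> [(fastq_prefix, fastq_dir), ...]
    col4 = {}                # sample -> optional 4th column
    with open(path) as fh:
        for row in csv.reader(fh, delimiter="\t"):
            # Skip blank lines, comments, and malformed rows.
            if not row or row[0].startswith("#") or len(row) < 3:
                continue
            prefix, fastq_dir, sample = row[0], row[1], row[2]
            fq[sample].append((prefix, fastq_dir))
            if len(row) > 3:
                col4[sample] = row[3]
    return fq, col4
```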
+ +### Example + +Sample1 has 2 paired-end fastq files located in 2 different directories(fastq_dir1 and fastq_dir2). Sample2 has 1 paired-end fastq file located in fastq_dir1. +``` +$cat ./my.mapfile +fastq_prefix1 fastq_dir1 sample1 +fastq_prefix2 fastq_dir2 sample1 +fastq_prefix3 fastq_dir1 sample2 + +$ls fastq_dir1 +fastq_prefix1_1.fq.gz fastq_prefix1_2.fq.gz +fastq_prefix3_1.fq.gz fastq_prefix3_2.fq.gz + +$ls fastq_dir2 +fastq_prefix2_1.fq.gz fastq_prefix2_2.fq.gz +``` + + + diff --git a/generate_docs.py b/generate_docs.py index a43105a0..e4788abc 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -1,6 +1,7 @@ import argparse import inspect import os +import importlib from collections import defaultdict import celescope.tools.utils as utils @@ -8,36 +9,13 @@ from celescope.celescope import ArgFormatter from celescope.__init__ import ASSAY_DICT, RELEASED_ASSAYS PRE_PROCESSING_STEPS = ('sample', 'barcode', 'cutadapt') -DOCS_ROOT = 'docs' -MANUAL_MD = f'{DOCS_ROOT}/manual.md' -MANUAL_TEMPLATE = f'{DOCS_ROOT}/manual_template.md' +DOCS_DIR = 'docs/' +TEMPLATE_DIR = 'docs_template/' +MANUAL_MD = f'{DOCS_DIR}/manual.md' +MANUAL_TEMPLATE = f'{DOCS_DIR}/manual_template.md' -def generate_single_step_doc(assay, step): - """ - Returns: - - md file relative to DOCS_ROOT - """ - step_module, folder = utils.find_step_module_with_folder(assay, step) - func_opts = getattr(step_module, f"get_opts_{step}") - - class_docs = get_class_docs(step_module) - argument_docs = get_argument_docs(func_opts) - - folder_path = f'{DOCS_ROOT}/{folder}/' - if not os.path.exists(folder_path): - os.system(f'mkdir -p {folder_path}') - - out_md = f'{DOCS_ROOT}/{folder}/{step}.md' - with open(out_md, 'w') as out_file: - out_file.write(class_docs) - out_file.write(argument_docs) - return f'{folder}/{step}.md' - -def get_argument_docs(func_opts): - argument_docs = "" - parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) - func_opts(parser, sub_program=True) +def get_argument_docs_from_parser(parser): for argument in parser._option_string_actions: if not argument in ['-h', '--help']: help_msg = parser._option_string_actions[argument].help @@ -69,6 +47,46 @@ def get_class_docs(step_module): return class_docs +class Docs(): + def __init__(self, assay): + self.assay = assay + + init_module = utils.find_assay_init(assay) + self.steps = init_module.__STEPS__ + self.steps.append(f'multi_{assay}') + folder = f'{DOCS_DIR}/{assay}/' + + self.out_md_dict = {} + self.relative_md_path = {} + for step in self.steps: + self.out_md_dict[step] = f'{folder}/{step}.md' + self.relative_md_path[step] = f'{assay}/{step}.md' + + if not os.path.exists(folder): + os.system(f'mkdir -p {folder}') + + def get_argument_docs(self, step, step_module): + if step.startswith("multi"): + multi_class = getattr(step_module, f'Multi_{self.assay}') + multi_obj = multi_class(self.assay) + argument_docs = get_argument_docs_from_parser(multi_obj.parser) + else: + parser = argparse.ArgumentParser(description='CeleScope', formatter_class=ArgFormatter) + func_opts = getattr(step_module, f"get_opts_{step}") + func_opts(parser, sub_program=True) + argument_docs = get_argument_docs_from_parser(parser) + return argument_docs + + + def write_step_doc(self, step): + step_module = utils.find_step_module(self.assay, step) + class_docs = get_class_docs(step_module) + argument_docs = self.get_argument_docs(step, step_module) + + with open(self.out_md_dict[step], 'w') as out_file: + out_file.write(class_docs) + out_file.write(argument_docs) 
+ def write_step_in_manual(md_path, step, manual_handle): """ - [mkref](rna/mkref.md) @@ -77,19 +95,24 @@ def write_step_in_manual(md_path, step, manual_handle): manual_handle.write(f'- [{step}]({md_path})\n') - +""" @utils.add_log def generate_all_docs(): md_path_dict = defaultdict(dict) + for assay in ASSAY_DICT: init_module = utils.find_assay_init(assay) - __STEPS__ = init_module.__STEPS__ + steps = init_module.__STEPS__ generate_all_docs.logger.info(f"Writing docs {assay} ") - for step in __STEPS__: + + steps.append(f'multi_{assay}') + for step in steps: generate_all_docs.logger.info(f"Writing doc {assay}.{step}") md_path = generate_single_step_doc(assay, step) md_path_dict[assay][step] = md_path return md_path_dict +""" + @utils.add_log def write_manual(md_path_dict): @@ -108,5 +131,6 @@ def write_manual(md_path_dict): if __name__ == "__main__": - md_path_dict = generate_all_docs() - write_manual(md_path_dict) \ No newline at end of file + cmd = f"cp -r {TEMPLATE_DIR} {DOCS_DIR}" + os.system(cmd) + \ No newline at end of file diff --git a/methods/rna.txt b/methods/rna.txt new file mode 100755 index 00000000..168a02a4 --- /dev/null +++ b/methods/rna.txt @@ -0,0 +1,10 @@ +Single-cell transcriptomics and analysis + +Raw reads were processed to generate gene expression profiles using CeleScope v1.2.0(Singleron Biotechnologies) +with default parameters. Briefly, Barcodes and UMIs were extracted from R1 reads and corrected. Adapter sequences and poly A tails were trimmed from R2 reads and the trimmed R2 reads were aligned against the {GRCh38 (hg38)} {GRCm38 (mm10)} transcriptome +using STAR(v2.6.1b). Uniquely mapped reads were then assigned to exons with FeatureCounts(v2.0.1). Successfully Assigned Reads with the same cell barcode, UMI and gene were grouped together to generate the gene expression matrix for further analysis. + +Single-cell analyses were performed using the Seurat package(v4.0.1). All the variable genes selected by the FindVariableFeatures function were used to compute the PCs. The first 20 PCs and a resolution of 0.6 were used for clustering and tSNE visualization. 
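
For illustration only, the Seurat workflow described above maps onto an equivalent outline in Python with scanpy. This is not the pipeline's code, and the input path is a placeholder:

```python
import scanpy as sc

# Placeholder path to a 10X-style matrix directory.
adata = sc.read_10x_mtx("path/to/matrix_10X")

sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata)           # variable gene selection
adata = adata[:, adata.var.highly_variable]
sc.tl.pca(adata, n_comps=20)                 # first 20 PCs
sc.pp.neighbors(adata, n_pcs=20)
sc.tl.leiden(adata, resolution=0.6)          # clustering at resolution 0.6 (needs leidenalg)
sc.tl.tsne(adata, n_pcs=20)                  # tSNE visualization
```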
+ + + -- Gitee From e28a9c82836225c917897a8bc35fada955301682 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:10:52 +0800 Subject: [PATCH 87/96] docs --- celescope/citeseq/multi_citeseq.py | 6 +- celescope/hla/multi_hla.py | 4 + celescope/snp/multi_snp.py | 21 +++ docs/CHANGELOG.md | 178 +++++++++++++++++++ docs/CONTRIBUTING.md | 106 +++++++++++ docs/capture_rna/count_capture_rna.md | 61 +++++++ docs/capture_rna/multi_capture_rna.md | 106 +++++++++++ docs/capture_virus/analysis_capture_virus.md | 19 ++ docs/capture_virus/count_capture_virus.md | 19 ++ docs/capture_virus/mkref.md | 15 ++ docs/capture_virus/multi_capture_virus.md | 94 ++++++++++ docs/citeseq/analysis_cite.md | 13 ++ docs/citeseq/count_cite.md | 13 ++ docs/citeseq/multi_citeseq.md | 95 ++++++++++ docs/dynaseq/conversion.md | 26 +++ docs/dynaseq/multi_dynaseq.md | 110 ++++++++++++ docs/dynaseq/replace_tsne.md | 30 ++++ docs/dynaseq/replacement.md | 36 ++++ docs/dynaseq/subsitution.md | 20 +++ docs/fusion/count_fusion.md | 23 +++ docs/fusion/mkref.md | 24 +++ docs/fusion/multi_fusion.md | 90 ++++++++++ docs/fusion/star_fusion.md | 32 ++++ docs/hla/mapping_hla.md | 15 ++ docs/hla/multi_hla.md | 73 ++++++++ docs/manual.md | 41 +++++ docs/manual_template.md | 23 +++ docs/mut/count_mut.md | 17 ++ docs/mut/mapping_mut.md | 17 ++ docs/mut/multi_mut.md | 81 +++++++++ docs/quick_start.md | 110 ++++++++++++ docs/rna/analysis.md | 51 ++++++ docs/rna/mkref.md | 38 ++++ docs/rna/multi_rna.md | 106 +++++++++++ docs/rna/star.md | 56 ++++++ docs/rna_virus/analysis_rna_virus.md | 17 ++ docs/rna_virus/count_virus.md | 17 ++ docs/rna_virus/multi_rna_virus.md | 92 ++++++++++ docs/rna_virus/star_virus.md | 32 ++++ docs/snp/analysis_snp.md | 23 +++ docs/snp/mkref.md | 29 +++ docs/snp/multi_snp.md | 97 ++++++++++ docs/snp/variant_calling.md | 38 ++++ docs/tag/analysis_tag.md | 19 ++ docs/tag/count_tag.md | 44 +++++ docs/tag/mapping_tag.md | 48 +++++ docs/tag/multi_tag.md | 108 +++++++++++ docs/tag/split_tag.md | 26 +++ docs/tcr_fl/assemble.md | 15 ++ docs/tcr_fl/multi_tcr_fl.md | 79 ++++++++ docs/tcr_fl/split_fq.md | 15 ++ docs/tools/barcode.md | 61 +++++++ docs/tools/consensus.md | 24 +++ docs/tools/count.md | 61 +++++++ docs/tools/cutadapt.md | 44 +++++ docs/tools/featureCounts.md | 38 ++++ docs/tools/sample.md | 17 ++ docs/tools/target_metrics.md | 28 +++ docs/vdj/count_vdj.md | 37 ++++ docs/vdj/mapping_vdj.md | 35 ++++ docs/vdj/multi_vdj.md | 84 +++++++++ generate_docs.py | 71 ++++++-- 62 files changed, 2949 insertions(+), 19 deletions(-) create mode 100755 docs/CHANGELOG.md create mode 100644 docs/CONTRIBUTING.md create mode 100644 docs/capture_rna/count_capture_rna.md create mode 100644 docs/capture_rna/multi_capture_rna.md create mode 100644 docs/capture_virus/analysis_capture_virus.md create mode 100644 docs/capture_virus/count_capture_virus.md create mode 100644 docs/capture_virus/mkref.md create mode 100644 docs/capture_virus/multi_capture_virus.md create mode 100644 docs/citeseq/analysis_cite.md create mode 100644 docs/citeseq/count_cite.md create mode 100644 docs/citeseq/multi_citeseq.md create mode 100644 docs/dynaseq/conversion.md create mode 100644 docs/dynaseq/multi_dynaseq.md create mode 100644 docs/dynaseq/replace_tsne.md create mode 100644 docs/dynaseq/replacement.md create mode 100644 docs/dynaseq/subsitution.md create mode 100644 docs/fusion/count_fusion.md create mode 100644 docs/fusion/mkref.md create mode 100644 docs/fusion/multi_fusion.md create mode 100644 docs/fusion/star_fusion.md create mode 100644 
docs/hla/mapping_hla.md
 create mode 100644 docs/hla/multi_hla.md
 create mode 100644 docs/manual.md
 create mode 100644 docs/manual_template.md
 create mode 100644 docs/mut/count_mut.md
 create mode 100644 docs/mut/mapping_mut.md
 create mode 100644 docs/mut/multi_mut.md
 create mode 100755 docs/quick_start.md
 create mode 100644 docs/rna/analysis.md
 create mode 100644 docs/rna/mkref.md
 create mode 100644 docs/rna/multi_rna.md
 create mode 100644 docs/rna/star.md
 create mode 100644 docs/rna_virus/analysis_rna_virus.md
 create mode 100644 docs/rna_virus/count_virus.md
 create mode 100644 docs/rna_virus/multi_rna_virus.md
 create mode 100644 docs/rna_virus/star_virus.md
 create mode 100644 docs/snp/analysis_snp.md
 create mode 100644 docs/snp/mkref.md
 create mode 100644 docs/snp/multi_snp.md
 create mode 100644 docs/snp/variant_calling.md
 create mode 100644 docs/tag/analysis_tag.md
 create mode 100644 docs/tag/count_tag.md
 create mode 100644 docs/tag/mapping_tag.md
 create mode 100644 docs/tag/multi_tag.md
 create mode 100644 docs/tag/split_tag.md
 create mode 100644 docs/tcr_fl/assemble.md
 create mode 100644 docs/tcr_fl/multi_tcr_fl.md
 create mode 100644 docs/tcr_fl/split_fq.md
 create mode 100644 docs/tools/barcode.md
 create mode 100644 docs/tools/consensus.md
 create mode 100644 docs/tools/count.md
 create mode 100644 docs/tools/cutadapt.md
 create mode 100644 docs/tools/featureCounts.md
 create mode 100644 docs/tools/sample.md
 create mode 100644 docs/tools/target_metrics.md
 create mode 100644 docs/vdj/count_vdj.md
 create mode 100644 docs/vdj/mapping_vdj.md
 create mode 100644 docs/vdj/multi_vdj.md
diff --git a/celescope/citeseq/multi_citeseq.py b/celescope/citeseq/multi_citeseq.py
index 0ec65bc9..e9b10e67 100755
--- a/celescope/citeseq/multi_citeseq.py
+++ b/celescope/citeseq/multi_citeseq.py
@@ -1,6 +1,10 @@
+from celescope.tools.multi import Multi
 
-def main():
+class Multi_citeseq(Multi):
+    pass
+
+def main():
     # TODO
     pass
 
diff --git a/celescope/hla/multi_hla.py b/celescope/hla/multi_hla.py
index 21e44651..802fc981 100755
--- a/celescope/hla/multi_hla.py
+++ b/celescope/hla/multi_hla.py
@@ -1,3 +1,7 @@
+from celescope.tools.multi import Multi
+
+class Multi_hla(Multi):
+    pass
 
 def main():
     # TODO
diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py
index 69dbc418..b0d3eba1 100755
--- a/celescope/snp/multi_snp.py
+++ b/celescope/snp/multi_snp.py
@@ -3,6 +3,27 @@ from celescope.tools.multi import Multi
 
 
 class Multi_snp(Multi):
+    """
+    Usage
+    ```
+    multi_snp\
+        --mapfile ./test1.mapfile\
+        --genomeDir {genomeDir after running celescope snp mkref}\
+        --thread 10\
+        --mod shell\
+        --gene_list gene_list.tsv\
+        --annovar_config annovar.config
+    ```
+    Example annovar_config file:
+    ```
+    [ANNOVAR]
+    dir = /Public/Software/annovar/
+    db = /SGRNJ/Database/script/database/annovar/humandb
+    buildver = hg38
+    protocol = refGene,cosmic70
+    operation = g,f
+    ```
+    """
 
     def star(self, sample):
         step = 'star'
diff --git a/docs/CHANGELOG.md b/docs/CHANGELOG.md
new file mode 100755
index 00000000..109dd6d7
--- /dev/null
+++ b/docs/CHANGELOG.md
@@ -0,0 +1,178 @@
+# Change Log
+
+## [unreleased] - 2021-06-09
+### Added
+
+### Changed
+
+### Fixed
+- `celescope.tools.count` will report an error when there are multiple gtf or refFlat files under `genomeDir`.
+
+### Removed
+- `celescope.tools.utils.glob_genomeDir`
+
+## [1.3.1] - 2021-06-09
+### Added
+
+- Add wdl workflow.
+
+- Add Seurat hashtag method in `celescope tag count_tag`. To get Seurat hashtag output, use `--debug`.
However, there was an unsolved problem with this method: https://github.com/satijalab/seurat/issues/2549.
+
+### Changed
+
+- `{sample}_UMI_count_filtered1.tsv` in mapping_vdj changed to `{sample}_UMI_count_filtered.tsv` (remove `1` after filtered)
+
+### Fixed and Removed
+
+- Remove h5 file generation in R to avoid memory issues.
+
+
+## [1.3.0] - 2021-05-28
+
+### Added
+
+- `mkref` subcommand. See `celescope rna mkref`, `celescope fusion mkref` and `celescope virus mkref` for details.
+
+### Changed
+
+- Change the way to handle duplicate gene_name and gene_id in gtf file.
+
+Previous:
+
+  - one gene_name with multiple gene_id: "_{count}" will be added to gene_name.
+  - one gene_id with multiple gene_name: newer gene_name will overwrite older gene_name.
+  - duplicated (gene_name, gene_id): "_{count}" will be added to gene_name.
+
+Now:
+
+  - one gene_name with multiple gene_id: "_{count}" will be added to gene_name.
+  - one gene_id with multiple gene_name: error.
+  - duplicated (gene_name, gene_id): ignore duplicated records and print a warning.
+
+### Fixed
+
+- Fix `count tag` metrics order in merge.xls
+
+### Removed
+
+- Remove `--fusion_pos` from `celescope.fusion.count_fusion`
+
+
+## [1.2.0] - 2021-05-19
+
+### Added
+
+- Assay `rna` outputs .h5 file in 06.analysis directory.
+
+### Changed
+
+- Update Seurat from 2.3.4 to 4.0.1.
+
+- `--genomeDir` in `celescope.fusion.star_fusion` changed to `--fusion_genomeDir` to avoid misunderstanding.
+
+- Step `star` sorts bam with samtools instead of STAR to avoid a potential `not enough memory for BAM sorting` error: https://github.com/alexdobin/STAR/issues/1136
+
+### Removed
+
+- Assay `rna` no longer outputs the tab-delimited expression matrix file in the 05.count directory.
+
+
+## [1.1.9] - 2021-04-25
+
+### Added
+
+- Add parameter `--coefficient` to `celescope tag count_tag` and `multi_tag`.
+
+  Default `0.1`. The minimum signal-to-noise ratio is calculated as `SNR_min = max(median(SNRs) * coefficient, 2)`.
+
+- Add `.metrics.json`.
+
+- Add `scopeV1` chemistry support.
+
+### Changed
+
+- Optimize speed and memory usage of step `barcode` (~2X faster) and `celescope.tools.count.downsample` (~15-25X faster, 1/2 memory usage).
+
+- Change filtering of linker from allowing two mismatches in total to two mismatches per segment; this will slightly increase the valid reads percentage.
+
+- Default output fastq files of `barcode` and `cutadapt` are not gzipped. Use `--gzipped` to get gzipped output.
+
+- Change the display of the barcode-rank plot in the html report.
+
+### Fixed
+
+- Fix a bug where `celescope.tools.barcode.mismatch` could not output all sequences correctly when n_mismatch>=2.
+
+- Fix an error when Numpy >= 1.2.0.
+
+- VDJ merge.xls can display all the metrics correctly.
+
+### Removed
+
+- Remove fastqc from the `barcode` step.
+
+
+## [1.1.8] - 2021-03-26
+
+### Added
+
+- Add read consensus to the VDJ pipeline.
+
+  A consensus step was added before mapping to merge all the reads of the same
+  (barcode, UMI) into one UMI. For the detailed consensus algorithm, refer to `celescope.tools.consensus`.
+  multi_vdj adds the parameter `--not_consensus` so that you can skip the consensus step and get the same results as v1.1.7.
+
+- Add parameter `--species` to `celescope vdj mapping_vdj` and `multi_vdj`.
+
+  `--species` can be one of:
+  - `hs`: human
+  - `mmu`: mouse
+
+- Add parameter `--cell_calling_method` to `celescope rna count` and `multi_rna`.
+
+  `--cell_calling_method` can be one of:
+  - `auto`: Same result as v1.1.7.
+  - `cellranger3`: Refer to the cell calling algorithm of cellranger3; the result is similar to cellranger3.
+  - `inflection`: Use the inflection point of the barcode-rank curve as the UMI threshold. The minimum UMI value is changed from initial threshold / 10 to initial threshold / 2 to prevent the use of a lower inflection point when there are multiple inflection points.
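+
+  For intuition, the inflection idea can be sketched in a few lines of Python (a toy illustration on the log-log barcode-rank curve, not the CeleScope implementation):
+
+  ```
+  import numpy as np
+
+  def inflection_threshold(umi_counts):
+      """Toy sketch: UMI value at the steepest drop of the barcode-rank curve."""
+      counts = np.sort(np.asarray(umi_counts))[::-1]      # rank-sorted, descending
+      counts = counts[counts > 0]
+      log_rank = np.log10(np.arange(1, counts.size + 1))
+      log_umi = np.log10(counts)
+      slopes = np.diff(log_umi) / np.diff(log_rank)       # d(log UMI) / d(log rank)
+      return counts[np.argmin(slopes) + 1]                # most negative slope = inflection
+  ```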
+
+- Add 4 tags to featureCounts bam:
+
+  - `CB`: cell barcode
+  - `UB`: UMI
+  - `GN`: gene name
+  - `GX`: gene id
+
+- Add `--STAR_param` to `celescope rna STAR`.
+
+  Additional parameters of STAR can be passed into the `STAR` step.
+
+### Changed
+
+- One sample can have fastq files of different chemistry in the mapfile. Version <= v1.1.7 reports this as an error.
+
+- Gtf file can be gzipped.
+
+- `multi_rna` can use 3 parameters: `--STAR_index`, `--gtf` and `--refFlat` instead of `--genomeDir`.
+
+- Step `snpCalling` uses mutract.
+
+
+## [1.1.7] - 2020-12-16
+
+### Added
+
+- Automatically detect Singleron chemistry version.
+
+### Changed
+
+- FeatureCounts uses strand specificity.
+
+- Cutadapt default `overlap` changed from `5` to `10`.
+
+- VDJ sorts `NA` last.
+
+- `match clonetypes` are sorted by barcode_count (Frequency) first, then clonetype_ID.
+
+
+
+
diff --git a/docs/CONTRIBUTING.md b/docs/CONTRIBUTING.md
new file mode 100644
index 00000000..fb9ee986
--- /dev/null
+++ b/docs/CONTRIBUTING.md
@@ -0,0 +1,106 @@
+## Pull Requests
+Create pull requests against the `dev` branch.
+
+## Lint code
+Before opening a pull request, you should lint your code with the following command:
+```
+pip install pylint
+# lint
+# W1618 (no-absolute-import)
+# E1101 (no-member)
+# W1633 (round-builtin)
+# W1619 (old-division)
+# W0105 (String statement has no effect)
+# W0511 TODO!
+# E1130 bad operand type for unary ~: _isnan (invalid-unary-operand-type)
+# W0212 Access to a protected member _option_string_actions of a client class (protected-access)
+pylint --disable=all --enable=E,W --disable=W1618,E1101,W1633,W1619,W0105,W0511,E1130,W0212 --jobs=8 celescope
+```
+Your code should be rated at 10 (i.e. no errors or warnings).
+
+## Write a new step
+When you add a new step, you need to
+ - Create a step class that inherits from `celescope.tools.step.Step`.
+ - Create a function with the same name as the module. The main `celescope` entry point uses this function to run each step.
+ - Create a parser function named `get_opts_{module_name}`. The `celescope` command line interface uses this function. The `sub_program` argument in this function hides all the arguments that you do not want to show in the `multi_{assay}` interface.
+
+For example, in `celescope.tools.cutadapt`:
+
+```
+from celescope.tools.step import Step, s_common
+import celescope.tools.utils as utils
+
+
+class Cutadapt(Step):
+    """
+    Features
+    - Trim adapters in R2 reads with cutadapt. Default adapters include:
+        - polyT=A{18}, 18 A bases.
+        - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter.
+
+    Output
+    - `cutadapt.log` Cutadapt output log file.
+    - `{sample}_clean_2.fq.gz` R2 reads file without adapters.
+    """
+
+    def __init__(self, args, step_name):
+        Step.__init__(self, args, step_name)
+        {some init code}
+
+    @utils.add_log
+    def run(self):
+        {some code to run}
+
+
+@utils.add_log
+def cutadapt(args):
+    step_name = "cutadapt"
+    cutadapt_obj = Cutadapt(args, step_name)
+    cutadapt_obj.run()
+
+
+def get_opts_cutadapt(parser, sub_program):
+    parser.add_argument('--adapter_fasta', help='Additional adapter fasta file.')
+    parser.add_argument(
+        '--minimum_length',
+        help='Default `20`. Discard processed reads that are shorter than LENGTH.',
+        default=20
+    )
+    {other arguments}
+    if sub_program:
+        parser.add_argument('--fq', help='Required. R2 reads from step Barcode.', required=True)
+        parser.add_argument('--gzip', help="Output gzipped fastq", action='store_true')
+        parser = s_common(parser)
+    return parser
+```
+
+## Docs
+`generate_docs.py`, a python script at the root of this repo, generates documents for each released step. The generated docs are in the `docs` folder. It collects:
+- The docstring of the step class. The docstring should have sections named `Features` and `Output`.
+- The help information in `get_opts_{module_name}`.
+
+Released assays will be added to `manual.md`.
+
+## Tests
+If you add new steps, you need to create a small dataset for integration tests. There is a test example in `celescope/tests/test_multi.py`. To run this example:
+
+
+1. Get test data
+```
+# If you have access to Singleron Nanjing HPC
+cp -r /SGRNJ03/randd/user/zhouyiqi/multi_tests/test_folder {test_dir}
+# Or clone from repo
+git clone https://github.com/singleron-RD/celescope_tests.git
+```
+
+2. Run `pytest`
+```
+# Install pytest
+pip install pytest
+# Run all tests
+pytest -s ./tests/test_multi.py --test_dir {test_dir}
+# Run some tests
+pytest -s ./tests/test_multi.py --test_dir {test_dir} --assays rna,tag
+```
+
+Then you need to create your own test based on this example.
\ No newline at end of file
diff --git a/docs/capture_rna/count_capture_rna.md b/docs/capture_rna/count_capture_rna.md
new file mode 100644
index 00000000..102eb020
--- /dev/null
+++ b/docs/capture_rna/count_capture_rna.md
@@ -0,0 +1,61 @@
+## Features
+- Cell-calling: Distinguish cell barcodes from background barcodes.
+
+- Generate expression matrix.
+
+## Output
+- `{sample}_all_matrix` The expression matrix of all detected barcodes.
+    Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix_10X` The expression matrix of the barcodes identified as cells.
+Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix.tsv.gz` The expression matrix of the barcodes identified as cells, separated by tabs.
+CeleScope >=1.2.0 does not output this file.
+
+- `{sample}_count_detail.txt.gz` 4 columns:
+    - barcode
+    - gene ID
+    - UMI count
+    - read_count
+
+- `{sample}_counts.txt` 6 columns:
+    - Barcode: barcode sequence
+    - readcount: read count of each barcode
+    - UMI2: UMI count (with reads per UMI >= 2) for each barcode
+    - UMI: UMI count for each barcode
+    - geneID: gene count for each barcode
+    - mark: cell barcode or background barcode.
+
+        `CB` cell
+        `UB` background
+
+- `{sample}_downsample.txt` 3 columns:
+    - percent: percentage of sampled reads
+    - median_geneNum: median gene number per cell
+    - saturation: sequencing saturation
+
+- `barcode_filter_magnitude.pdf` Barcode-UMI plot.
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--expected_cell_num` Default `3000`. Expected cell number.
+
+`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
+`--bam` Required. BAM file from featureCounts.
+
+`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%.
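+
+For Python users, a rough equivalent of `Seurat::Read10X` for these matrix folders can be sketched with scipy and pandas. The barcodes/features/matrix file names below follow the usual 10X triplet and are an assumption, not a documented CeleScope contract:
+
+```
+import pandas as pd
+from scipy.io import mmread
+
+def read_matrix_10x(matrix_dir):
+    """Load a 10X-style triplet as (sparse matrix, features, barcodes)."""
+    mtx = mmread(f"{matrix_dir}/matrix.mtx").tocsr()      # features x barcodes
+    features = pd.read_csv(f"{matrix_dir}/features.tsv", sep="\t", header=None)
+    barcodes = pd.read_csv(f"{matrix_dir}/barcodes.tsv", sep="\t", header=None)[0]
+    return mtx, features, barcodes
+```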
diff --git a/docs/capture_rna/multi_capture_rna.md b/docs/capture_rna/multi_capture_rna.md
new file mode 100644
index 00000000..a2c6067a
--- /dev/null
+++ b/docs/capture_rna/multi_capture_rna.md
@@ -0,0 +1,106 @@
+
+
+## Arguments
+`--mod` mod, sjm or shell
+
+`--mapfile` tsv file, 4 columns:
+    1st col: LibName;
+    2nd col: DataDir;
+    3rd col: SampleName;
+    4th col: optional;
+
+`--rm_files` remove redundant fq.gz and bam files after running
+
+`--steps_run` Steps to run. Multiple steps are separated by commas.
+
+`--outdir` Output directory.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional files for debugging.
+
+`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of:
+- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations.
+- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries.
+- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the
+same time.
+
+`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number
+    of bases.
+- `C`: cell barcode
+- `L`: linker (common sequences)
+- `U`: UMI
+- `T`: poly T
+
+`--whitelist` Cell barcode whitelist file path, one cell barcode per line.
+
+`--linker` Linker whitelist file path, one linker per line.
+
+`--lowQual` Default 0. Bases in the cell barcode and UMI whose phred values are lower than lowQual will be regarded as low-quality bases.
+
+`--lowNum` The maximum allowed lowQual bases in the cell barcode and UMI.
+
+`--nopolyT` Outputs R1 reads without polyT.
+
+`--noLinker` Outputs R1 reads without correct linker.
+
+`--allowNoPolyT` Allow valid reads without polyT.
+
+`--allowNoLinker` Allow valid reads without correct linker.
+
+`--gzip` Output gzipped fastq files.
+
+`--adapter_fasta` Additional adapter fasta file.
+
+`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH.
+
+`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq).
+Some Illumina instruments use a two-color chemistry to encode the four bases.
+This includes the NextSeq and the NovaSeq.
+In those instruments, a ‘dark cycle’ (with no detected color) encodes a G.
+However, dark cycles also occur when sequencing “falls off” the end of the fragment.
+The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+
+`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases.
+For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter.
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that
+at least {overlap} bases match between adapter and read.
+
+`--insert` Default `150`. Read2 insert length.
+
+`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases
+is higher than or equal to this value.
+ +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--gtf_type` Specify feature type in GTF annotation + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. + +`--genomeDir` Required. Genome directory. + +`--save_rds` Write rds to disk. + +`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: +``` +cell_type marker +Alveolar "CLDN18,FOLR1,AQP4,PEBP4" +Endothelial "CLDN5,FLT1,CDH5,RAMP2" +Epithelial "CAPS,TMEM190,PIFO,SNTN" +Fibroblast "COL1A1,DCN,COL1A2,C1R" +B_cell "CD79A,IGKC,IGLC3,IGHG3" +Myeloid "LYZ,MARCO,FCGR3A" +T_cell "CD3D,TRBC1,TRBC2,TRAC" +LUAD "NKX2-1,NAPSA,EPCAM" +LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" +``` + diff --git a/docs/capture_virus/analysis_capture_virus.md b/docs/capture_virus/analysis_capture_virus.md new file mode 100644 index 00000000..01aeb0c1 --- /dev/null +++ b/docs/capture_virus/analysis_capture_virus.md @@ -0,0 +1,19 @@ + + +## Arguments +`--umi_threshold` method to find virus UMI threshold + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--match_dir` match_dir + +`--virus_file` virus UMI count file + diff --git a/docs/capture_virus/count_capture_virus.md b/docs/capture_virus/count_capture_virus.md new file mode 100644 index 00000000..2789114d --- /dev/null +++ b/docs/capture_virus/count_capture_virus.md @@ -0,0 +1,19 @@ + + +## Arguments +`--min_query_length` minimum query length + +`--outdir` output dir + +`--assay` assay + +`--sample` sample name + +`--thread` None + +`--debug` debug + +`--match_dir` matched rna_virus directory + +`--virus_bam` None + diff --git a/docs/capture_virus/mkref.md b/docs/capture_virus/mkref.md new file mode 100644 index 00000000..a3627da9 --- /dev/null +++ b/docs/capture_virus/mkref.md @@ -0,0 +1,15 @@ + + +## Arguments +`--genomeDir` Default='./'. Output directory. + +`--thread` Default=6. Threads to use. + +`--genome_name` Required, genome name. + +`--dry_run` Only write config file and exit. + +`--fasta` virus fasta file + +`--genomeSAindexNbases` STAR genomeSAindexNbases + diff --git a/docs/capture_virus/multi_capture_virus.md b/docs/capture_virus/multi_capture_virus.md new file mode 100644 index 00000000..95c3421d --- /dev/null +++ b/docs/capture_virus/multi_capture_virus.md @@ -0,0 +1,94 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. 
You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--threshold` Default 0.5. Valid base threshold. + +`--not_consensus` Skip the consensus step. + +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. 
+ +`--virus_genomeDir` virus genome dir + +`--min_query_length` minimum query length + +`--umi_threshold` method to find virus UMI threshold + diff --git a/docs/citeseq/analysis_cite.md b/docs/citeseq/analysis_cite.md new file mode 100644 index 00000000..df9b21da --- /dev/null +++ b/docs/citeseq/analysis_cite.md @@ -0,0 +1,13 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--match_dir` match_dir + +`--citeseq_mtx` citeseq matrix .gz file + +`--assay` assay + diff --git a/docs/citeseq/count_cite.md b/docs/citeseq/count_cite.md new file mode 100644 index 00000000..55cbeaf4 --- /dev/null +++ b/docs/citeseq/count_cite.md @@ -0,0 +1,13 @@ + + +## Arguments +`--match_dir` matched scRNA-Seq CeleScope directory path + +`--outdir` output dir + +`--sample` sample name + +`--assay` assay + +`--read_count_file` tag read count file + diff --git a/docs/citeseq/multi_citeseq.md b/docs/citeseq/multi_citeseq.md new file mode 100644 index 00000000..6674538f --- /dev/null +++ b/docs/citeseq/multi_citeseq.md @@ -0,0 +1,95 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. 
+For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. +`L` linker(common sequences) +`C` tag barcode + +`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode +sequence in R2 reads with all tag barcode sequence in barcode_fasta. +It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. +If no such tag exists, the read is classified as invalid. +``` +>tag_0 +GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC +>tag_1 +TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG +>tag_2 +AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA +>tag_3 +CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG +``` + +`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads +with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. + +`--match_dir` matched scRNA-Seq CeleScope directory path + diff --git a/docs/dynaseq/conversion.md b/docs/dynaseq/conversion.md new file mode 100644 index 00000000..bfb0cb2a --- /dev/null +++ b/docs/dynaseq/conversion.md @@ -0,0 +1,26 @@ +## Features +- Get conversion pos in each read. + - Get snp info. + +## Output +- `{sample}.PosTag.bam` Bam file with conversion info. +- `{sample}.PosTag.csv` SNP info in csv format. + + +## Arguments +`--strand` gene strand file + +`--bam` featureCount bam + +`--cell` barcode cell list + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/multi_dynaseq.md b/docs/dynaseq/multi_dynaseq.md new file mode 100644 index 00000000..b84dcd21 --- /dev/null +++ b/docs/dynaseq/multi_dynaseq.md @@ -0,0 +1,110 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. 
+ +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--gtf_type` Specify feature type in GTF annotation + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. + +`--genomeDir` Required. Genome directory. + +`--save_rds` Write rds to disk. + +`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example: +``` +cell_type marker +Alveolar "CLDN18,FOLR1,AQP4,PEBP4" +Endothelial "CLDN5,FLT1,CDH5,RAMP2" +Epithelial "CAPS,TMEM190,PIFO,SNTN" +Fibroblast "COL1A1,DCN,COL1A2,C1R" +B_cell "CD79A,IGKC,IGLC3,IGHG3" +Myeloid "LYZ,MARCO,FCGR3A" +T_cell "CD3D,TRBC1,TRBC2,TRAC" +LUAD "NKX2-1,NAPSA,EPCAM" +LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM" +``` + +`--strand` gene strand file + +`--bg_cov` background snp depth filter, lower than bg_cov will be discarded. Only valid in csv format + diff --git a/docs/dynaseq/replace_tsne.md b/docs/dynaseq/replace_tsne.md new file mode 100644 index 00000000..31ed90ce --- /dev/null +++ b/docs/dynaseq/replace_tsne.md @@ -0,0 +1,30 @@ +## Features +- Replace rate in each cluster +- Top replace genes in each cluster + +## Output +- `{sample}.rep_in_tsne.txt` Replace rate in each cluster. +- `{sample}.rep_in_tsne_top10` Top 10 replace genes in each cluster. + + +## Arguments +`--tsne` tsne file + +`--mat` matrix rep file + +`--rep` cell rep file + +`--mincell` turn-over in at least cells, default 5 + +`--topgene` top N genes,default 10 + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. 
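+
+The two summaries above are straightforward aggregations. A toy pandas sketch of the idea (column and index names are assumed; the real step may differ):
+
+```
+import pandas as pd
+
+def rep_in_cluster(rep, clusters, topgene=10, mincell=5):
+    """rep: DataFrame (cells x genes) of per-cell new-RNA fractions, NaN where
+    a gene has no turnover information; clusters: Series mapping cell -> cluster."""
+    top = {}
+    for label, cells in rep.groupby(clusters):
+        detected = cells.notna().sum()                    # informative cells per gene
+        rates = cells.mean().where(detected >= mincell)   # respect the --mincell floor
+        top[label] = rates.nlargest(topgene)              # top genes by mean new fraction
+    return top
+```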
+ diff --git a/docs/dynaseq/replacement.md b/docs/dynaseq/replacement.md new file mode 100644 index 00000000..1184777c --- /dev/null +++ b/docs/dynaseq/replacement.md @@ -0,0 +1,36 @@ +## Features +- Computes the replacement rates in each cell and gene. +- Boxplots for rates distribution. + +## Output +- `{sample}.TC_matrix.rds` New and old info for each barcode/gene/umi. +- `{sample}.new_matrix.tsv.gz` New RNA matrix. +- `{sample}.old_matrix.tsv.gz` Old RNA matrix. +- `{sample}.fraction_of_newRNA_per_cell.txt` Fraction of new RNA of each cell. +- `{sample}.fraction_of_newRNA_per_gene.txt` Fraction of new RNA of each gene. +- `{sample}.fraction_of_newRNA_matrix.txt` Fraction of new RNA of each cell and gene. + + +## Arguments +`--bg_cov` background snp depth filter, lower than bg_cov will be discarded. Only valid in csv format + +`--bam` bam file + +`--bg` background snp file + +`--cell_keep` filter cell + +`--min_cell` a gene expressed in at least cells, default 10 + +`--min_gene` at least gene num in a cell, default 10 + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/dynaseq/subsitution.md b/docs/dynaseq/subsitution.md new file mode 100644 index 00000000..e2b7b169 --- /dev/null +++ b/docs/dynaseq/subsitution.md @@ -0,0 +1,20 @@ +## Features +- Computes the overall conversion rates in reads and plots a barplot. + +## Output +- `{sample}.substitution.txt` Tab-separated table of the overall conversion rates. + + +## Arguments +`--bam` bam file + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/fusion/count_fusion.md b/docs/fusion/count_fusion.md new file mode 100644 index 00000000..39063118 --- /dev/null +++ b/docs/fusion/count_fusion.md @@ -0,0 +1,23 @@ + + +## Arguments +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--bam` None + +`--match_dir` match scRNA-Seq dir + +`--fusion_genomeDir` fusion genome directory + +`--flanking_base` None + +`--UMI_min` None + diff --git a/docs/fusion/mkref.md b/docs/fusion/mkref.md new file mode 100644 index 00000000..24ca414f --- /dev/null +++ b/docs/fusion/mkref.md @@ -0,0 +1,24 @@ + + +## Arguments +`--genomeDir` Default='./'. Output directory. + +`--thread` Default=6. Threads to use. + +`--genome_name` Required, genome name. + +`--dry_run` Only write config file and exit. + +`--fasta` fusion fasta file + +`--fusion_pos` fusion position file. A two column tab-delimited text file with header. +"pos" is the end postion of the first gene(1-based). +e.g. +tag pos +PML_3 183 +PML_4 254 +PML_5 326 +PML_6 204 + +`--genomeSAindexNbases` STAR genomeSAindexNbases + diff --git a/docs/fusion/multi_fusion.md b/docs/fusion/multi_fusion.md new file mode 100644 index 00000000..291a1756 --- /dev/null +++ b/docs/fusion/multi_fusion.md @@ -0,0 +1,90 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. 
+ +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fusion_genomeDir` fusion genome directory + +`--flanking_base` None + +`--UMI_min` None + diff --git a/docs/fusion/star_fusion.md b/docs/fusion/star_fusion.md new file mode 100644 index 00000000..4ac60c75 --- /dev/null +++ b/docs/fusion/star_fusion.md @@ -0,0 +1,32 @@ + + +## Arguments +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. 
+ +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fq` Required. R2 fastq file. + +`--consensus_fq` Input fastq has been consensused + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--fusion_genomeDir` fusion gene STAR index genome directory + diff --git a/docs/hla/mapping_hla.md b/docs/hla/mapping_hla.md new file mode 100644 index 00000000..d4f5f1e3 --- /dev/null +++ b/docs/hla/mapping_hla.md @@ -0,0 +1,15 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--fq` None + +`--assay` assay + +`--match_dir` match scRNA-Seq dir + +`--thread` number of thread + diff --git a/docs/hla/multi_hla.md b/docs/hla/multi_hla.md new file mode 100644 index 00000000..62c19f1b --- /dev/null +++ b/docs/hla/multi_hla.md @@ -0,0 +1,73 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. 
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--match_dir` match scRNA-Seq dir + +`--thread` number of thread + diff --git a/docs/manual.md b/docs/manual.md new file mode 100644 index 00000000..b39281ce --- /dev/null +++ b/docs/manual.md @@ -0,0 +1,41 @@ +## Introduction +CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. + +Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. + +Currently, CeleScope includes the follwing pipelines: + +- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). + +- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. + +- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. + + +## [Quick start](quick_start.md) + +## [Change log](CHANGELOG.md) + +## Pre-processing + +- [barcode](tools/barcode.md) +- [cutadapt](tools/cutadapt.md) + +## Single-cell rna +- [mkref](rna/mkref.md) +- [star](rna/star.md) +- [featureCounts](tools/featureCounts.md) +- [count](tools/count.md) +- [analysis](rna/analysis.md) +- [multi_rna](rna/multi_rna.md) +## Single-cell vdj +- [consensus](tools/consensus.md) +- [mapping_vdj](vdj/mapping_vdj.md) +- [count_vdj](vdj/count_vdj.md) +- [multi_vdj](vdj/multi_vdj.md) +## Single-cell tag +- [mapping_tag](tag/mapping_tag.md) +- [count_tag](tag/count_tag.md) +- [analysis_tag](tag/analysis_tag.md) +- [split_tag](tag/split_tag.md) +- [multi_tag](tag/multi_tag.md) diff --git a/docs/manual_template.md b/docs/manual_template.md new file mode 100644 index 00000000..c524de94 --- /dev/null +++ b/docs/manual_template.md @@ -0,0 +1,23 @@ +## Introduction +CeleScope is a collection of bioinfomatics analysis pipelines developed at Singleron to process single cell sequencing data generated with Singleron products. These pipelines take paired-end FASTQ files as input and generate output files which can be used for downstream data analysis as well as a summary of QC criteria. + +Each pipeline consists of several steps and they all have two identical pre-processing steps: `barcode` and `cutadapt`. `barcode`step is used for barcode demupltiplexing, correction and read filtering. `cutadapt`step calls [Cutadapt](https://cutadapt.readthedocs.io/en/stable/) for read trimming. + +Currently, CeleScope includes the follwing pipelines: + +- `celescope rna` for Single-cell RNA-seq data generated with GEXSCOPE kits. 
It performs preprocessing, genome alignment, feature counting, expression matrix generation, clustering, marker gene expression analysis and cell type assignment(optional). + +- `celescope vdj` for Single-cell Immune Repertoire data generated with GEXSCOPE IR kits. It performs preprocessing, UMI consensus, vdj sequence alignment, UMI filtering and clonetypes counting. + +- `celescope tag` for Single-cell Multiplexing data generated with CLindex Sample Multiplexing kits. It performs preprocessing, tag counting, tag assignment and multiplets identification. + + +## [Quick start](quick_start.md) + +## [Change log](CHANGELOG.md) + +## Pre-processing + +- [barcode](tools/barcode.md) +- [cutadapt](tools/cutadapt.md) + diff --git a/docs/mut/count_mut.md b/docs/mut/count_mut.md new file mode 100644 index 00000000..38699f30 --- /dev/null +++ b/docs/mut/count_mut.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--bam` None + +`--assay` assay + +`--mut_file` mutation file + +`--match_dir` match scRNA-Seq dir + +`--shift_base` None + diff --git a/docs/mut/mapping_mut.md b/docs/mut/mapping_mut.md new file mode 100644 index 00000000..afccd1b6 --- /dev/null +++ b/docs/mut/mapping_mut.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--sample` sample name + +`--fq` None + +`--assay` assay + +`--indel_genomeDir` insertion or deletion STAR indexed genome directory + +`--thread` STAR thread + +`--outFilterMatchNmin` STAR outFilterMatchNmin + diff --git a/docs/mut/multi_mut.md b/docs/mut/multi_mut.md new file mode 100644 index 00000000..23111560 --- /dev/null +++ b/docs/mut/multi_mut.md @@ -0,0 +1,81 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. 
Quality trimming of reads using two-color chemistry (NextSeq).
+Some Illumina instruments use a two-color chemistry to encode the four bases.
+This includes the NextSeq and the NovaSeq.
+In those instruments, a ‘dark cycle’ (with no detected color) encodes a G.
+However, dark cycles also occur when sequencing “falls off” the end of the fragment.
+The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+
+`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases.
+For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter.
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that
+at least {overlap} bases match between adapter and read.
+
+`--insert` Default `150`. Read2 insert length.
+
+`--indel_genomeDir` insertion or deletion STAR indexed genome directory
+
+`--thread` STAR thread
+
+`--outFilterMatchNmin` STAR outFilterMatchNmin
+
+`--mut_file` mutation file
+
+`--match_dir` match scRNA-Seq dir
+
+`--shift_base` None
+
diff --git a/docs/quick_start.md b/docs/quick_start.md
new file mode 100755
index 00000000..57d2327f
--- /dev/null
+++ b/docs/quick_start.md
@@ -0,0 +1,110 @@
+# Quick start
+
+CeleScope contains `multi_{assay}` interfaces to generate pipeline scripts for all assays. Assays can be one of:
+
+- rna
+- vdj
+- tag
+
+Run `multi_{assay} -h` for help.
+
+
+## Usage Example
+
+- Single-cell rna
+
+    ```
+    conda activate celescope
+    multi_rna\
+    --mapfile ./rna.mapfile\
+    --genomeDir /SGRNJ/Public/Database/genome/homo_mus\
+    --thread 8\
+    --mod shell
+    ```
+`--mapfile` Required. Mapfile path.
+
+`--genomeDir` Required. Genome directory.
+
+`--thread` The recommended setting is 8, and the maximum should not exceed 20.
+
+`--mod` Create `sjm` (simple job manager, https://github.com/StanfordBioinformatics/SJM) or `shell` scripts.
+
+The scripts above will generate a `shell` directory containing `{sample}.sh` files.
+
+You can start your analysis by running:
+```
+sh ./shell/{sample}.sh
+```
+
+- Single cell vdj
+
+```
+conda activate celescope
+multi_vdj \
+    --mapfile ./vdj.mapfile \
+    --type TCR \
+    --thread 8 \
+    --mod shell
+```
+
+`--type` Required. TCR or BCR.
+
+- Single cell tag
+
+```
+conda activate celescope
+multi_tag \
+    --mapfile ./tag.mapfile\
+    --barcode_fasta ./smk_barcode.fa\
+    --fq_pattern L25C45\
+    --mod shell
+```
+
+`--barcode_fasta` Required. Tag barcode fasta file.
+```
+>tag_0
+GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC
+>tag_1
+TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG
+>tag_2
+AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA
+>tag_3
+CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG
+```
+
+`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases.
+
+`L` linker (common sequences)
+`C` tag barcode
+
+## How to write mapfile
+
+Mapfile is a tab-delimited text file with at least three columns. Each line of the mapfile represents one pair of fastq files; a validation sketch and a worked example follow below.
+
+1st column: Fastq file prefix.
+2nd column: Fastq file directory path.
+3rd column: Sample name, which is the prefix of all output files.
+4th column: The 4th column has a different meaning for each assay. The single-cell rna output directory produced by running CeleScope is called `matched_dir`.
+- `rna` Optional, forced cell number.
+- `vdj` Optional, matched_dir.
+- `tag` Required, matched_dir.
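+
+A minimal sanity check for a mapfile can be written in a few lines of Python (a hypothetical helper, not part of CeleScope, assuming the `_1.fq.gz`/`_2.fq.gz` naming used in the example below):
+
+```
+import os
+
+def check_mapfile(path):
+    """Assert that every mapfile row points at an existing fastq pair."""
+    with open(path) as handle:
+        for number, line in enumerate(handle, start=1):
+            fields = line.rstrip("\n").split("\t")
+            assert len(fields) >= 3, f"line {number}: expected at least 3 columns"
+            prefix, fastq_dir, _sample = fields[:3]
+            for read in ("1", "2"):
+                fastq = os.path.join(fastq_dir, f"{prefix}_{read}.fq.gz")
+                assert os.path.exists(fastq), f"line {number}: missing {fastq}"
+```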
+
+### Example
+
+Sample1 has 2 paired-end fastq files located in 2 different directories (fastq_dir1 and fastq_dir2). Sample2 has 1 paired-end fastq file located in fastq_dir1.
+```
+$cat ./my.mapfile
+fastq_prefix1 fastq_dir1 sample1
+fastq_prefix2 fastq_dir2 sample1
+fastq_prefix3 fastq_dir1 sample2
+
+$ls fastq_dir1
+fastq_prefix1_1.fq.gz fastq_prefix1_2.fq.gz
+fastq_prefix3_1.fq.gz fastq_prefix3_2.fq.gz
+
+$ls fastq_dir2
+fastq_prefix2_1.fq.gz fastq_prefix2_2.fq.gz
+```
+
+
+
diff --git a/docs/rna/analysis.md b/docs/rna/analysis.md
new file mode 100644
index 00000000..9ddfd1b3
--- /dev/null
+++ b/docs/rna/analysis.md
@@ -0,0 +1,51 @@
+## Features
+- Cell clustering with Seurat.
+
+- Calculate the marker genes of each cluster.
+
+- Cell type annotation (optional). You can provide markers of known cell types and annotate cell types for each cluster.
+
+## Output
+- `markers.tsv` Marker genes of each cluster.
+
+- `tsne_coord.tsv` t-SNE coordinates and clustering information.
+
+- `{sample}/06.analysis/{sample}_auto_assign/` This result will only be obtained when the `--type_marker_tsv`
+parameter is provided. The result contains 3 files:
+    - `{sample}_auto_cluster_type.tsv` The cell type of each cluster; if cell_type is "NA",
+it means that the given markers are not enough to identify the cluster.
+    - `{sample}_png/{cluster}_pctdiff.png` Percentage of marker gene expression in this cluster - percentage in all other clusters.
+    - `{sample}_png/{cluster}_logfc.png` log2 (average expression of marker gene in this cluster / average expression in all other clusters + 1)
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--save_rds` Write rds to disk.
+
+`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example:
+```
+cell_type marker
+Alveolar "CLDN18,FOLR1,AQP4,PEBP4"
+Endothelial "CLDN5,FLT1,CDH5,RAMP2"
+Epithelial "CAPS,TMEM190,PIFO,SNTN"
+Fibroblast "COL1A1,DCN,COL1A2,C1R"
+B_cell "CD79A,IGKC,IGLC3,IGHG3"
+Myeloid "LYZ,MARCO,FCGR3A"
+T_cell "CD3D,TRBC1,TRBC2,TRAC"
+LUAD "NKX2-1,NAPSA,EPCAM"
+LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM"
+```
+
+`--matrix_file` Required. Matrix_10X directory from step count.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/rna/mkref.md b/docs/rna/mkref.md
new file mode 100644
index 00000000..c1b3d592
--- /dev/null
+++ b/docs/rna/mkref.md
@@ -0,0 +1,38 @@
+## Features
+- Create a genome reference directory.
+
+## Output
+
+- STAR genome index files
+
+- Genome refFlat file
+
+- Genome config file
+```
+$ cat celescope_genome.config
+[genome]
+genome_name = Homo_sapiens_ensembl_99
+genome_type = rna
+fasta = Homo_sapiens.GRCh38.dna.primary_assembly.fa
+gtf = Homo_sapiens.GRCh38.99.gtf
+refflat = Homo_sapiens_ensembl_99.refFlat
+```
+
+
+## Arguments
+`--genomeDir` Default='./'. Output directory.
+
+`--thread` Default=6. Threads to use.
+
+`--genome_name` Required, genome name.
+
+`--dry_run` Only write config file and exit.
+
+`--fasta` Required. Genome fasta file. Must be relative file path to genomeDir.
+
+`--gtf` Required. Genome gtf file. Must be relative file path to genomeDir.
+
+`--mt_gene_list` Mitochondria gene list file. Must be relative file path to genomeDir.
+It is a plain text file with one gene per line.
+If not provided, will use `MT-` and `mt-` to determine mitochondria genes.
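+
+## Usage
+
+A minimal sketch (the file names are taken from the config example above and are illustrative):
+```
+# run inside genomeDir; fasta and gtf paths are relative to it
+celescope rna mkref \
+ --genome_name Homo_sapiens_ensembl_99 \
+ --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa \
+ --gtf Homo_sapiens.GRCh38.99.gtf
+```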
+ diff --git a/docs/rna/multi_rna.md b/docs/rna/multi_rna.md new file mode 100644 index 00000000..a2c6067a --- /dev/null +++ b/docs/rna/multi_rna.md @@ -0,0 +1,106 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--gtf_type` Specify feature type in GTF annotation + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. 
Choose from `auto`, `cellranger3` and `inflection`.
+
+`--genomeDir` Required. Genome directory.
+
+`--save_rds` Write rds to disk.
+
+`--type_marker_tsv` A tsv file with header. If this parameter is provided, cell type will be annotated. Example:
+```
+cell_type marker
+Alveolar "CLDN18,FOLR1,AQP4,PEBP4"
+Endothelial "CLDN5,FLT1,CDH5,RAMP2"
+Epithelial "CAPS,TMEM190,PIFO,SNTN"
+Fibroblast "COL1A1,DCN,COL1A2,C1R"
+B_cell "CD79A,IGKC,IGLC3,IGHG3"
+Myeloid "LYZ,MARCO,FCGR3A"
+T_cell "CD3D,TRBC1,TRBC2,TRAC"
+LUAD "NKX2-1,NAPSA,EPCAM"
+LUSC "TP63,KRT5,KRT6A,KRT6B,EPCAM"
+```
+
diff --git a/docs/rna/star.md b/docs/rna/star.md
new file mode 100644
index 00000000..ec3b5211
--- /dev/null
+++ b/docs/rna/star.md
@@ -0,0 +1,56 @@
+## Features
+- Align R2 reads to the reference genome with STAR.
+- Collect metrics with Picard.
+
+## Output
+- `{sample}_Aligned.sortedByCoord.out.bam` BAM file containing uniquely mapped reads.
+
+- `{sample}_SJ.out.tab` SJ.out.tab contains high confidence collapsed splice junctions in tab-delimited format.
+
+- `{sample}_Log.out` Main log with a lot of detailed information about the run.
+This is most useful for troubleshooting and debugging.
+
+- `{sample}_Log.progress.out` Reports job progress statistics, such as the number of processed reads,
+% of mapped reads etc. It is updated at 1-minute intervals.
+
+- `{sample}_Log.final.out` Summary mapping statistics after the mapping job is complete,
+very useful for quality control. The statistics are calculated for each read (single- or paired-end) and
+then summed or averaged over all reads. Note that STAR counts a paired-end read as one read
+(unlike the samtools flagstat/idxstats, which count each mate separately).
+Most of the information is collected about the UNIQUE mappers
+(unlike samtools flagstat/idxstats, which do not separate unique and multi-mappers).
+Each splicing is counted in the numbers of splices, which would correspond to
+summing the counts in SJ.out.tab. The mismatch/indel error rates are calculated on a per base basis,
+i.e. as total number of mismatches/indels in all unique mappers divided by the total number of mapped bases.
+
+- `{sample}_region.log` Picard CollectRnaSeqMetrics results.
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases
+is higher than or equal to this value.
+
+`--out_unmapped` Output unmapped reads.
+
+`--STAR_param` Other STAR parameters.
+
+`--outFilterMultimapNmax` Default `1`. Maximum number of loci a read is allowed to map to.
+
+`--starMem` Default `30`. Maximum memory that STAR can use.
+
+`--fq` Required. R2 fastq file.
+
+`--consensus_fq` Input fastq has been consensused.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
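+
+As a minimal sketch of how `{sample}_Log.final.out` can be consumed downstream (the metric names are
+assumptions based on the standard STAR log layout, so check them against your STAR version):
+```python
+def parse_star_log(path):
+    """Parse STAR's Log.final.out into a {metric: value} dict."""
+    metrics = {}
+    with open(path) as f:
+        for line in f:
+            if '|' not in line:
+                continue  # skip section headers, which have no '|'
+            key, value = line.split('|', 1)
+            metrics[key.strip()] = value.strip()
+    return metrics
+
+log = parse_star_log('sample1_Log.final.out')
+print(log['Uniquely mapped reads %'])  # e.g. '85.00%'
+```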
+ diff --git a/docs/rna_virus/analysis_rna_virus.md b/docs/rna_virus/analysis_rna_virus.md new file mode 100644 index 00000000..8893b4e1 --- /dev/null +++ b/docs/rna_virus/analysis_rna_virus.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--assay` assay + +`--sample` sample name + +`--thread` None + +`--debug` debug + +`--matrix_file` matrix file + +`--virus_file` virus UMI count file + diff --git a/docs/rna_virus/count_virus.md b/docs/rna_virus/count_virus.md new file mode 100644 index 00000000..60f09d5c --- /dev/null +++ b/docs/rna_virus/count_virus.md @@ -0,0 +1,17 @@ + + +## Arguments +`--outdir` output dir + +`--assay` assay + +`--sample` sample name + +`--thread` None + +`--debug` debug + +`--virus_bam` None + +`--barcode_file` None + diff --git a/docs/rna_virus/multi_rna_virus.md b/docs/rna_virus/multi_rna_virus.md new file mode 100644 index 00000000..26bf08ff --- /dev/null +++ b/docs/rna_virus/multi_rna_virus.md @@ -0,0 +1,92 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. 
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--virus_genomeDir` virus genome dir + +`--gtf_type` Specify feature type in GTF annotation + +`--genomeDir` Required. Genome directory. + +`--expected_cell_num` Default `3000`. Expected cell number. + +`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`. + diff --git a/docs/rna_virus/star_virus.md b/docs/rna_virus/star_virus.md new file mode 100644 index 00000000..7ef14bd0 --- /dev/null +++ b/docs/rna_virus/star_virus.md @@ -0,0 +1,32 @@ + + +## Arguments +`--genomeDir` Required. Genome directory. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. + +`--fq` Required. R2 fastq file. + +`--consensus_fq` Input fastq has been consensused + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--virus_genomeDir` virus genome dir + diff --git a/docs/snp/analysis_snp.md b/docs/snp/analysis_snp.md new file mode 100644 index 00000000..fb2bd136 --- /dev/null +++ b/docs/snp/analysis_snp.md @@ -0,0 +1,23 @@ + + +## Arguments +`--annovar_config` annovar soft config file + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--match_dir` match_dir + +`--vcf` vcf file + +`--CID_file` CID_file + +`--variant_count_file` variant count file + diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md new file mode 100644 index 00000000..34858a56 --- /dev/null +++ b/docs/snp/mkref.md @@ -0,0 +1,29 @@ +## Features +- Create dictionary file and fasta index for gatk SplitNCigarReads. +(https://gatk.broadinstitute.org/hc/en-us/articles/360035531652-FASTA-Reference-genome-format) +Need to run `celescope rna mkref` first + +## Output +- fasta index +- gatk dictionary file + +## Usage +``` +# run celescope rna mkref first +celescope snp mkref \ + --genome_name Homo_sapiens_ensembl_99 \ + --fasta Homo_sapiens.GRCh38.dna.primary_assembly.fa +``` + + +## Arguments +`--genomeDir` Default='./'. Output directory. + +`--thread` Default=6. Threads to use. + +`--genome_name` Required, genome name. + +`--dry_run` Only write config file and exit. 
+ +`--fasta` fasta file + diff --git a/docs/snp/multi_snp.md b/docs/snp/multi_snp.md new file mode 100644 index 00000000..ee762e9a --- /dev/null +++ b/docs/snp/multi_snp.md @@ -0,0 +1,97 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--threshold` Default 0.5. Valid base threshold. + +`--not_consensus` Skip the consensus step. + +`--outFilterMatchNmin` Default `0`. Alignment will be output only if the number of matched bases +is higher than or equal to this value. + +`--out_unmapped` Output unmapped reads + +`--STAR_param` Other STAR parameters + +`--outFilterMultimapNmax` Default `1`. How many places are allowed to match a read at most. + +`--starMem` Default `30`. Maximum memory that STAR can use. 
+
+`--gtf_type` Specify feature type in GTF annotation.
+
+`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+
+`--genomeDir` Genome directory after running `mkref`.
+
+`--vcf` VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level
+and use these variants as input vcf.
+
+`--annovar_config` annovar config file
+
diff --git a/docs/snp/variant_calling.md b/docs/snp/variant_calling.md
new file mode 100644
index 00000000..aed2d6fa
--- /dev/null
+++ b/docs/snp/variant_calling.md
@@ -0,0 +1,38 @@
+## Features
+- Perform variant calling.
+
+## Output
+
+`{sample}_VID.tsv` A unique numeric ID is assigned for each variant.
+
+`{sample}_CID.tsv` A unique numeric ID is assigned for each cell.
+
+`{sample}_variant_count.tsv` Reference and variant supporting reads/UMIs count.
+
+`{sample}_support.mtx` Support matrix; only high quality bases are considered.
+0 : no reads/UMIs cover the position.
+1 : all reads/UMIs at the position support the ref allele.
+2 : all reads/UMIs at the position support the alt allele.
+3 : one or more reads/UMIs support both the alt and the ref allele.
+
+
+## Arguments
+`--genomeDir` Genome directory after running `mkref`.
+
+`--vcf` VCF file. If vcf file is not provided, celescope will perform variant calling at single cell level
+and use these variants as input vcf.
+
+`--bam` Input BAM file from step `target_metrics`.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tag/analysis_tag.md b/docs/tag/analysis_tag.md
new file mode 100644
index 00000000..da0f476e
--- /dev/null
+++ b/docs/tag/analysis_tag.md
@@ -0,0 +1,19 @@
+## Features
+- Combine scRNA-Seq clustering information with tag assignment.
+
+
+## Arguments
+`--tsne_tag_file` `{sample}_tsne_tag.tsv` from count_tag.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tag/count_tag.md b/docs/tag/count_tag.md
new file mode 100644
index 00000000..815b3eb1
--- /dev/null
+++ b/docs/tag/count_tag.md
@@ -0,0 +1,44 @@
+## Features
+- Assign tag to each cell barcode and summarize.
+
+## Output
+
+- `{sample}_umi_tag.tsv`
+
+    `first column` cell barcode
+    `last column` assigned tag
+    `columns between first and last` UMI count for each tag
+
+- `{sample}_tsne_tag.tsv` It is `{sample}_umi_tag.tsv` with t-SNE coordinates, gene_counts and cluster information.
+
+- `{sample}_cluster_count.tsv` Number of cell barcodes assigned to *undetermined*, *multiplet* and *each tag*.
+
+
+## Arguments
+`--UMI_min` Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undetermined*.
+
+`--dim` Default=1. Tag dimensions. Usually we use 1-dimensional tags.
+
+`--SNR_min` Default='auto'. Minimum signal-to-noise ratio.
+Cell barcodes with UMI >= UMI_min and SNR < SNR_min are classified as *multiplet*.
+
+`--combine_cluster` Combine cluster tsv file.
+
+`--coefficient` Default=0.1. If `SNR_min` is 'auto', the minimum signal-to-noise ratio is calculated as
+`SNR_min = max(median(SNRs) * coefficient, 2)`.
+A smaller `coefficient` will result in fewer *multiplet* calls in the tag assignment.
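+
+As a minimal sketch of the 'auto' threshold (the variable names are illustrative, not the actual implementation):
+```python
+import statistics
+
+def auto_SNR_min(SNRs, coefficient=0.1):
+    """SNR_min = max(median(SNRs) * coefficient, 2)"""
+    return max(statistics.median(SNRs) * coefficient, 2)
+
+snrs = [30.0, 1.2, 25.0, 0.9, 40.0]  # hypothetical per-barcode SNR values
+print(auto_SNR_min(snrs))  # 2.5
+```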
+ +`--read_count_file` Tag read count file. + +`--match_dir` Match celescope scRNA-Seq directory. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/tag/mapping_tag.md b/docs/tag/mapping_tag.md new file mode 100644 index 00000000..1f7ab1ae --- /dev/null +++ b/docs/tag/mapping_tag.md @@ -0,0 +1,48 @@ +## Features +- Align R2 reads to the tag barcode fasta. + +## Output + +- `{sample}_read_count.tsv` tab-delimited text file with 4 columns. + + `barcode` cell barcode + `tag_name` tag name in barcode_fasta + `UMI` UMI sequence + `read_count` read count per UMI + + +## Arguments +`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. +`L` linker(common sequences) +`C` tag barcode + +`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode +sequence in R2 reads with all tag barcode sequence in barcode_fasta. +It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. +If no such tag exists, the read is classified as invalid. +``` +>tag_0 +GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC +>tag_1 +TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG +>tag_2 +AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA +>tag_3 +CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG +``` + +`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads +with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--fq` R2 read fastq. + diff --git a/docs/tag/multi_tag.md b/docs/tag/multi_tag.md new file mode 100644 index 00000000..58414ae6 --- /dev/null +++ b/docs/tag/multi_tag.md @@ -0,0 +1,108 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--outdir` Output directory. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. 
+ +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--fq_pattern` Required. R2 read pattern. The number after the letter represents the number of bases. +`L` linker(common sequences) +`C` tag barcode + +`--barcode_fasta` Required. Tag barcode fasta file. It will check the mismatches between tag barcode +sequence in R2 reads with all tag barcode sequence in barcode_fasta. +It will assign read to the tag with mismatch < len(tag barcode) / 10 + 1. +If no such tag exists, the read is classified as invalid. +``` +>tag_0 +GGGCGTCTGTGACCGCGTGATACTGCATTGTAGACCGCCCAACTC +>tag_1 +TTCCTCCAGAGGAGACCGAGCCGGTCAATTCAGGAGAACGTCCGG +>tag_2 +AGGGCTAGGCGTGTCATTTGGCGAGGTCCTGAGGTCATGGAGCCA +>tag_3 +CACTGGTCATCGACACTGGGAACCTGAGGTGAGTTCGCGCGCAAG +``` + +`--linker_fasta` Optional. If provided, it will check the mismatches between linker sequence in R2 reads +with all linker sequence in linker_fasta. If no mismatch < len(linker) / 10 + 1, the read is classified as invalid. + +`--UMI_min` Default='auto'. Minimum UMI threshold. Cell barcodes with valid UMI < UMI_min are classified as *undeterminded*. + +`--dim` Default=1. Tag dimentions. Usually we use 1-dimentional tag. + +`--SNR_min` Default='auto'. Minimum signal-to-noise ratio. +Cell barcodes with UMI >=UMI_min and SNR < SNR_min are classified as *multiplet*. + +`--combine_cluster` Conbine cluster tsv file. + +`--coefficient` Default=0.1. If `SNR_min` is 'auto', minimum signal-to-noise ratio is calulated as +`SNR_min = max(median(SNRs) * coefficient, 2)`. +Smaller `coefficient` will cause less *multiplet* in the tag assignment. + +`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment. + diff --git a/docs/tag/split_tag.md b/docs/tag/split_tag.md new file mode 100644 index 00000000..5a43f7f8 --- /dev/null +++ b/docs/tag/split_tag.md @@ -0,0 +1,26 @@ +## Features +- Split scRNA-Seq fastq according to tag assignment. + +## Output +- `fastq/{tag}_{1,2}.fq` Fastq files of each tag. + + +## Arguments +`--split_fastq` If used, will split scRNA-Seq fastq file according to tag assignment. + +`--umi_tag_file` UMI tag file. + +`--match_dir` Match celescope scRNA-Seq directory. + +`--R1_read` R1 read path. + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. 
+ +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + diff --git a/docs/tcr_fl/assemble.md b/docs/tcr_fl/assemble.md new file mode 100644 index 00000000..95662af1 --- /dev/null +++ b/docs/tcr_fl/assemble.md @@ -0,0 +1,15 @@ + + +## Arguments +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. + +`--debug` If this argument is used, celescope may output addtional file for debugging. + +`--fastq_dir` None + diff --git a/docs/tcr_fl/multi_tcr_fl.md b/docs/tcr_fl/multi_tcr_fl.md new file mode 100644 index 00000000..e70531b4 --- /dev/null +++ b/docs/tcr_fl/multi_tcr_fl.md @@ -0,0 +1,79 @@ + + +## Arguments +`--mod` mod, sjm or shell + +`--mapfile` tsv file, 4 columns: + 1st col: LibName; + 2nd col: DataDir; + 3rd col: SampleName; + 4th col: optional; + +`--rm_files` remove redundant fq.gz and bam after running + +`--steps_run` Steps to run. Multiple Steps are separated by comma. + +`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of: +- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations. +- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries. +- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the +same time. + +`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number + of bases. +- `C`: cell barcode +- `L`: linker(common sequences) +- `U`: UMI +- `T`: poly T + +`--whitelist` Cell barcode whitelist file path, one cell barcode per line. + +`--linker` Linker whitelist file path, one linker per line. + +`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases. + +`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI. + +`--nopolyT` Outputs R1 reads without polyT. + +`--noLinker` Outputs R1 reads without correct linker. + +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--match_dir` match scRNA-Seq dir + +`--nCell` select top N cell + +`--outdir` Output diretory. + +`--assay` Assay name. + +`--sample` Sample name. + +`--thread` Thread to use. 
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tcr_fl/split_fq.md b/docs/tcr_fl/split_fq.md
new file mode 100644
index 00000000..fe767158
--- /dev/null
+++ b/docs/tcr_fl/split_fq.md
@@ -0,0 +1,15 @@
+
+
+## Arguments
+`--outdir` output dir
+
+`--sample` sample name
+
+`--fq` None
+
+`--assay` assay
+
+`--match_dir` match scRNA-Seq dir
+
+`--nCell` select top N cells
+
diff --git a/docs/tools/barcode.md b/docs/tools/barcode.md
new file mode 100644
index 00000000..9f31bd94
--- /dev/null
+++ b/docs/tools/barcode.md
@@ -0,0 +1,61 @@
+## Features
+
+- Demultiplex barcodes.
+- Filter invalid R1 reads, which include:
+    - Reads without linker: the mismatch between linkers and all linkers in the whitelist is greater than 2.
+    - Reads without correct barcode: the mismatch between barcodes and all barcodes in the whitelist is greater than 1.
+    - Reads without polyT: the number of T bases in the defined polyT region is less than 10.
+    - Low quality reads: low sequencing quality in barcode and UMI regions.
+
+## Output
+
+- `01.barcode/{sample}_2.fq(.gz)` Demultiplexed R2 reads. Barcode and UMI are contained in the read name. The format of
+the read name is `{barcode}_{UMI}_{read ID}`.
+
+
+## Arguments
+`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of:
+- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations.
+- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries.
+- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the
+same time.
+
+`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number
+ of bases.
+- `C`: cell barcode
+- `L`: linker (common sequences)
+- `U`: UMI
+- `T`: poly T
+
+`--whitelist` Cell barcode whitelist file path, one cell barcode per line.
+
+`--linker` Linker whitelist file path, one linker per line.
+
+`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases.
+
+`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI.
+
+`--nopolyT` Outputs R1 reads without polyT.
+
+`--noLinker` Outputs R1 reads without correct linker.
+
+`--allowNoPolyT` Allow valid reads without polyT.
+
+`--allowNoLinker` Allow valid reads without correct linker.
+
+`--gzip` Output gzipped fastq files.
+
+`--fq1` R1 fastq file. Multiple files are separated by comma.
+
+`--fq2` R2 fastq file. Multiple files are separated by comma.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tools/consensus.md b/docs/tools/consensus.md
new file mode 100644
index 00000000..77e11286
--- /dev/null
+++ b/docs/tools/consensus.md
@@ -0,0 +1,24 @@
+## Features
+- Combine all reads with the same (barcode, UMI) combination into one consensus read (UMI).
+
+## Output
+- `{sample}_consensus.fq` Consensus fastq.
+
+
+## Arguments
+`--threshold` Default 0.5. Valid base threshold.
+
+`--not_consensus` Skip the consensus step.
+
+`--fq` Required. Fastq file.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
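+
+The per-position idea behind the `--threshold` option, as a minimal sketch (an illustration, not the actual
+implementation; it assumes reads are already grouped by (barcode, UMI)):
+```python
+from collections import Counter
+
+def consensus(reads, threshold=0.5):
+    """Majority base per position; 'N' when no base reaches the threshold."""
+    length = max(len(r) for r in reads)
+    bases = []
+    for i in range(length):
+        counts = Counter(r[i] for r in reads if i < len(r))
+        base, n = counts.most_common(1)[0]
+        bases.append(base if n / sum(counts.values()) >= threshold else 'N')
+    return ''.join(bases)
+
+print(consensus(['ACGT', 'ACGA', 'ACGT']))  # ACGT
+```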
+
diff --git a/docs/tools/count.md b/docs/tools/count.md
new file mode 100644
index 00000000..102eb020
--- /dev/null
+++ b/docs/tools/count.md
@@ -0,0 +1,61 @@
+## Features
+- Cell-calling: Distinguish cell barcodes from background barcodes.
+
+- Generate expression matrix.
+
+## Output
+- `{sample}_all_matrix` The expression matrix of all detected barcodes.
+ Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix_10X` The expression matrix of barcodes identified as cells.
+Can be read in by calling the `Seurat::Read10X` function.
+
+- `{sample}_matrix.tsv.gz` The expression matrix of barcodes identified as cells, separated by tabs.
+CeleScope >=1.2.0 does not output this file.
+
+- `{sample}_count_detail.txt.gz` 4 columns:
+    - barcode
+    - gene ID
+    - UMI count
+    - read_count
+
+- `{sample}_counts.txt` 6 columns:
+    - Barcode: barcode sequence
+    - readcount: read count of each barcode
+    - UMI2: UMI count (with reads per UMI >= 2) for each barcode
+    - UMI: UMI count for each barcode
+    - geneID: gene count for each barcode
+    - mark: cell barcode or background barcode.
+
+        `CB` cell
+        `UB` background
+
+- `{sample}_downsample.txt` 3 columns:
+    - percent: percentage of sampled reads
+    - median_geneNum: median gene number per cell
+    - saturation: sequencing saturation
+
+- `barcode_filter_magnitude.pdf` Barcode-UMI plot.
+
+
+## Arguments
+`--genomeDir` Required. Genome directory.
+
+`--expected_cell_num` Default `3000`. Expected cell number.
+
+`--cell_calling_method` Default `auto`. Cell calling methods. Choose from `auto`, `cellranger3` and `inflection`.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
+`--bam` Required. BAM file from featureCounts.
+
+`--force_cell_num` Default `None`. Force the cell number to be this value ± 10%.
+
diff --git a/docs/tools/cutadapt.md b/docs/tools/cutadapt.md
new file mode 100644
index 00000000..e75d6e72
--- /dev/null
+++ b/docs/tools/cutadapt.md
@@ -0,0 +1,44 @@
+## Features
+- Trim adapters in R2 reads with cutadapt. Default adapters include:
+    - polyT=A{18}, 18 A bases.
+    - p5=AGATCGGAAGAGCACACGTCTGAACTCCAGTCA, Illumina p5 adapter.
+
+## Output
+- `cutadapt.log` Cutadapt output log file.
+- `{sample}_clean_2.fq.gz` R2 reads file without adapters.
+
+
+## Arguments
+`--adapter_fasta` Additional adapter fasta file.
+
+`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH.
+
+`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq).
+Some Illumina instruments use a two-color chemistry to encode the four bases.
+This includes the NextSeq and the NovaSeq.
+In those instruments, a ‘dark cycle’ (with no detected color) encodes a G.
+However, dark cycles also occur when sequencing “falls off” the end of the fragment.
+The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end.
+
+`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence,
+short matches can occur by chance, leading to erroneously trimmed bases.
+For example, roughly 25% of all reads end with a base that is identical to the first base of the adapter.
+To reduce the number of falsely trimmed bases, the alignment algorithm requires that
+at least {overlap} bases match between adapter and read.
+
+`--insert` Default `150`. Read2 insert length.
+
+`--fq` Required. R2 reads from step Barcode.
+
+`--gzip` Output gzipped fastq.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tools/featureCounts.md b/docs/tools/featureCounts.md
new file mode 100644
index 00000000..3822ca55
--- /dev/null
+++ b/docs/tools/featureCounts.md
@@ -0,0 +1,38 @@
+## Features
+
+- Assign uniquely mapped reads to genomic features with featureCounts.
+
+## Output
+- `{sample}` Numbers of reads assigned to features (or meta-features).
+
+- `{sample}_summary` Stat info for the overall summarization results, including the number of
+successfully assigned reads and the number of reads that failed to be assigned due to
+various reasons (these reasons are included in the stat info).
+
+- `{sample}_Aligned.sortedByCoord.out.bam.featureCounts.bam` featureCounts output BAM,
+sorted by coordinates. The BAM file contains the following tags (software version >= 1.1.8):
+    - CB cell barcode
+    - UB UMI
+    - GN gene name
+    - GX gene id
+
+- `{sample}_name_sorted.bam` featureCounts output BAM, sorted by read name.
+
+
+## Arguments
+`--gtf_type` Specify feature type in GTF annotation.
+
+`--genomeDir` Required. Genome directory.
+
+`--input` Required. BAM file path.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/tools/sample.md b/docs/tools/sample.md
new file mode 100644
index 00000000..e6fb6ce3
--- /dev/null
+++ b/docs/tools/sample.md
@@ -0,0 +1,17 @@
+
+
+## Arguments
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
+`--fq1` read1 fq file
+
+`--chemistry` chemistry version
+
diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md
new file mode 100644
index 00000000..d2fbaa04
--- /dev/null
+++ b/docs/tools/target_metrics.md
@@ -0,0 +1,28 @@
+## Features
+- Filter BAM file
+    - Filter reads that are not cell-associated.
+    - Filter reads that are not mapped to target genes.
+
+- Collect enrichment metrics.
+
+## Output
+- `filtered.bam` BAM file after filtering.
+
+
+## Arguments
+`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+
+`--bam` Input bam file
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/vdj/count_vdj.md b/docs/vdj/count_vdj.md
new file mode 100644
index 00000000..bd10f86d
--- /dev/null
+++ b/docs/vdj/count_vdj.md
@@ -0,0 +1,37 @@
+## Features
+- Cell-calling based on barcode-UMI rank.
+- Summarize clonetypes information.
+
+## Output
+- `{sample}_cell_confident.tsv` The clone type of VDJ cell barcodes; each chain occupies one line.
+
+- `{sample}_cell_confident_count.tsv` The clone type of VDJ cell barcodes; each cell occupies one line.
+
+- `{sample}_clonetypes.tsv` The count and percentage of each clonetype of VDJ cell barcodes.
+
+- `{sample}_match_clonetypes.tsv` When summarizing clonetypes, only consider barcodes in the matched scRNA-Seq library.
+This file will only be produced when the `match_dir` parameter is provided.
+
+
+## Arguments
+`--type` Required. `TCR` or `BCR`.
+
+`--UMI_min` Default `auto`. Minimum UMI number to filter. A barcode with UMI >= UMI_min is considered to be a cell.
+
+`--iUMI` Default `1`. Minimum number of UMIs of identical receptor type and CDR3.
+For each (barcode, chain) combination, only UMI >= iUMI is considered valid.
+
+`--UMI_count_filter_file` Required. File from step mapping_vdj.
+
+`--match_dir` Match celescope scRNA-Seq directory.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/vdj/mapping_vdj.md b/docs/vdj/mapping_vdj.md
new file mode 100644
index 00000000..25bb304a
--- /dev/null
+++ b/docs/vdj/mapping_vdj.md
@@ -0,0 +1,35 @@
+## Features
+- Align R2 reads to the IMGT (http://www.imgt.org/) database sequences with MiXCR.
+
+## Output
+- `{sample}_consensus.fasta` Fasta file after UMI consensus.
+
+- `{sample}_UMI_count_unfiltered.tsv` UMI read count for each (barcode, chain, VJ_pair) combination.
+
+- `{sample}_UMI_count_filtered.tsv` For each (barcode, chain) combination, only the record with the
+most VJ_pair UMI reads is kept.
+
+- `{sample}_align.txt` Result report.
+
+- `{sample}_alignments.txt` The alignment result of each UMI/read.
+
+
+## Arguments
+`--type` TCR or BCR.
+
+`--species` Default `hs`. `hs` (human) or `mmu` (mouse).
+
+`--not_consensus` Input fastq is not consensused.
+
+`--fq` Required. Input fastq file.
+
+`--outdir` Output directory.
+
+`--assay` Assay name.
+
+`--sample` Sample name.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
diff --git a/docs/vdj/multi_vdj.md b/docs/vdj/multi_vdj.md
new file mode 100644
index 00000000..2db8fd77
--- /dev/null
+++ b/docs/vdj/multi_vdj.md
@@ -0,0 +1,84 @@
+
+
+## Arguments
+`--mod` mod, sjm or shell
+
+`--mapfile` tsv file, 4 columns:
+    1st col: LibName;
+    2nd col: DataDir;
+    3rd col: SampleName;
+    4th col: optional;
+
+`--rm_files` remove redundant fq.gz and bam after running
+
+`--steps_run` Steps to run. Multiple Steps are separated by comma.
+
+`--outdir` Output directory.
+
+`--thread` Thread to use.
+
+`--debug` If this argument is used, celescope may output additional file for debugging.
+
+`--chemistry` Predefined (pattern, barcode whitelist, linker whitelist) combinations. Can be one of:
+- `auto` Default value. Used for Singleron GEXSCOPE libraries >= scopeV2 and automatically detects the combinations.
+- `scopeV1` Used for legacy Singleron GEXSCOPE scopeV1 libraries.
+- `customized` Used for user defined combinations. You need to provide `pattern`, `whitelist` and `linker` at the
+same time.
+
+`--pattern` The pattern of R1 reads, e.g. `C8L16C8L16C8L1U12T18`. The number after the letter represents the number
+ of bases.
+- `C`: cell barcode
+- `L`: linker (common sequences)
+- `U`: UMI
+- `T`: poly T
+
+`--whitelist` Cell barcode whitelist file path, one cell barcode per line.
+
+`--linker` Linker whitelist file path, one linker per line.
+
+`--lowQual` Default 0. Bases in cell barcode and UMI whose phred value are lower than lowQual will be regarded as low-quality bases.
+
+`--lowNum` The maximum allowed lowQual bases in cell barcode and UMI.
+
+`--nopolyT` Outputs R1 reads without polyT.
+
+`--noLinker` Outputs R1 reads without correct linker.
+ +`--allowNoPolyT` Allow valid reads without polyT. + +`--allowNoLinker` Allow valid reads without correct linker. + +`--gzip` Output gzipped fastq files. + +`--adapter_fasta` Addtional adapter fasta file. + +`--minimum_length` Default `20`. Discard processed reads that are shorter than LENGTH. + +`--nextseq_trim` Default `20`. Quality trimming of reads using two-color chemistry (NextSeq). +Some Illumina instruments use a two-color chemistry to encode the four bases. +This includes the NextSeq and the NovaSeq. +In those instruments, a ‘dark cycle’ (with no detected color) encodes a G. +However, dark cycles also occur when sequencing “falls off” the end of the fragment. +The read then contains a run of high-quality, but incorrect “G” calls at its 3’ end. + +`--overlap` Default `10`. Since Cutadapt allows partial matches between the read and the adapter sequence, +short matches can occur by chance, leading to erroneously trimmed bases. +For example, roughly 0.25 of all reads end with a base that is identical to the first base of the adapter. +To reduce the number of falsely trimmed bases, the alignment algorithm requires that +at least {overlap} bases match between adapter and read. + +`--insert` Default `150`. Read2 insert length. + +`--threshold` Default 0.5. Valid base threshold. + +`--species` Default `hs`. `hs`(human) or `mmu`(mouse). + +`--not_consensus` Input fastq is not consensused. + +`--type` Required. `TCR` or `BCR`. + +`--UMI_min` Default `auto`. Minimum UMI number to filter. The barcode with UMI>=UMI_min is considered to be cell. + +`--iUMI` Default `1`. Minimum number of UMI of identical receptor type and CDR3. +For each (barcode, chain) combination, only UMI>=iUMI is considered valid. + diff --git a/generate_docs.py b/generate_docs.py index e4788abc..ddc2a421 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -11,11 +11,12 @@ from celescope.__init__ import ASSAY_DICT, RELEASED_ASSAYS PRE_PROCESSING_STEPS = ('sample', 'barcode', 'cutadapt') DOCS_DIR = 'docs/' TEMPLATE_DIR = 'docs_template/' -MANUAL_MD = f'{DOCS_DIR}/manual.md' +MANUAL = f'{DOCS_DIR}/manual.md' MANUAL_TEMPLATE = f'{DOCS_DIR}/manual_template.md' def get_argument_docs_from_parser(parser): + argument_docs = "" for argument in parser._option_string_actions: if not argument in ['-h', '--help']: help_msg = parser._option_string_actions[argument].help @@ -52,20 +53,20 @@ class Docs(): self.assay = assay init_module = utils.find_assay_init(assay) - self.steps = init_module.__STEPS__ + self.steps = init_module.__STEPS__.copy() self.steps.append(f'multi_{assay}') - folder = f'{DOCS_DIR}/{assay}/' self.out_md_dict = {} self.relative_md_path = {} - for step in self.steps: - self.out_md_dict[step] = f'{folder}/{step}.md' - self.relative_md_path[step] = f'{assay}/{step}.md' + self.release_bool = self.assay in RELEASED_ASSAYS - if not os.path.exists(folder): - os.system(f'mkdir -p {folder}') + assay_dir = f'docs/{assay}' + if not os.path.exists(assay_dir): + os.system(f'mkdir -p {assay_dir}') + @utils.add_log def get_argument_docs(self, step, step_module): + self.get_argument_docs.logger.info(step) if step.startswith("multi"): multi_class = getattr(step_module, f'Multi_{self.assay}') multi_obj = multi_class(self.assay) @@ -75,11 +76,18 @@ class Docs(): func_opts = getattr(step_module, f"get_opts_{step}") func_opts(parser, sub_program=True) argument_docs = get_argument_docs_from_parser(parser) - return argument_docs + return argument_docs def write_step_doc(self, step): + """ + folder: docs/folder/*.md + """ step_module 
= utils.find_step_module(self.assay, step) + folder = step_module.__name__.split('.')[1] + self.out_md_dict[step] = f'docs/{folder}/{step}.md' + self.relative_md_path[step] = f'{folder}/{step}.md' + class_docs = get_class_docs(step_module) argument_docs = self.get_argument_docs(step, step_module) @@ -87,12 +95,24 @@ class Docs(): out_file.write(class_docs) out_file.write(argument_docs) -def write_step_in_manual(md_path, step, manual_handle): - """ - - [mkref](rna/mkref.md) - """ - if not step in PRE_PROCESSING_STEPS: - manual_handle.write(f'- [{step}]({md_path})\n') + def run(self): + if self.release_bool: + with open(MANUAL, 'a') as writer: + writer.write(f'## {ASSAY_DICT[self.assay]}\n') + + for step in self.steps: + self.write_step_doc(step) + if self.release_bool: + self.write_step_in_manual(step) + + + def write_step_in_manual(self, step): + """ + - [mkref](rna/mkref.md) + """ + if not step in PRE_PROCESSING_STEPS: + with open(MANUAL, 'a') as writer: + writer.write(f'- [{step}]({self.relative_md_path[step]})\n') """ @@ -129,8 +149,23 @@ def write_manual(md_path_dict): write_step_in_manual(md_path, step, manual_handle) +def main(): + cmd = ( + f"rm -r {DOCS_DIR};" + f"cp -r {TEMPLATE_DIR} {DOCS_DIR}" + ) + os.system(cmd) + + with open(MANUAL, 'w') as manual_handle: + with open(MANUAL_TEMPLATE, 'r') as manual_template: + manual_handle.write(manual_template.read()) + + for assay in ASSAY_DICT: + docs_obj = Docs(assay) + docs_obj.run() + + + if __name__ == "__main__": - cmd = f"cp -r {TEMPLATE_DIR} {DOCS_DIR}" - os.system(cmd) - \ No newline at end of file + main() -- Gitee From ff359a28a975736dc3d8d4191385ed103a07e8e9 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:16:44 +0800 Subject: [PATCH 88/96] update --- celescope/snp/multi_snp.py | 24 ++++++++++++------------ docs/snp/multi_snp.md | 19 +++++++++++++++++++ generate_docs.py | 9 ++++++++- 3 files changed, 39 insertions(+), 13 deletions(-) diff --git a/celescope/snp/multi_snp.py b/celescope/snp/multi_snp.py index b0d3eba1..7215ad08 100755 --- a/celescope/snp/multi_snp.py +++ b/celescope/snp/multi_snp.py @@ -6,22 +6,22 @@ class Multi_snp(Multi): """ Usage ``` - multi_snp\ - --mapfile ./test1.mapfile\ - --genomeDir {genomeDir after running celescope snp mkref}\ - --thread 10\ - --mod shell\ - --gene_list gene_list.tsv\ - --annovar_config annovar.config\ + multi_snp\\ + --mapfile ./test1.mapfile\\ + --genomeDir {genomeDir after running celescope snp mkref}\\ + --thread 10\\ + --mod shell\\ + --gene_list gene_list.tsv\\ + --annovar_config annovar.config\\ ``` annovar_config file ``` [ANNOVAR] - dir = /Public/Software/annovar/ - db = /SGRNJ/Database/script/database/annovar/humandb - buildver = hg38 - protocol = refGene,cosmic70 - operation = g,f + dir = /Public/Software/annovar/ + db = /SGRNJ/Database/script/database/annovar/humandb + buildver = hg38 + protocol = refGene,cosmic70 + operation = g,f ``` """ diff --git a/docs/snp/multi_snp.md b/docs/snp/multi_snp.md index ee762e9a..481b950f 100644 --- a/docs/snp/multi_snp.md +++ b/docs/snp/multi_snp.md @@ -1,3 +1,22 @@ +## Usage +``` +multi_snp\ + --mapfile ./test1.mapfile\ + --genomeDir {genomeDir after running celescope snp mkref}\ + --thread 10\ + --mod shell\ + --gene_list gene_list.tsv\ + --annovar_config annovar.config\ +``` +annovar_config file +``` +[ANNOVAR] +dir = /Public/Software/annovar/ +db = /SGRNJ/Database/script/database/annovar/humandb +buildver = hg38 +protocol = refGene,cosmic70 +operation = g,f +``` ## Arguments diff --git a/generate_docs.py 
b/generate_docs.py index ddc2a421..de87484c 100644 --- a/generate_docs.py +++ b/generate_docs.py @@ -36,7 +36,14 @@ def get_class_docs(step_module): if class_obj.__module__ != step_module.__name__: continue doc = inspect.getdoc(class_obj) - if doc and "Features" in doc: + + write_bool = False + if doc: + for title in titles: + if title in doc: + write_bool = True + + if write_bool: for line in doc.split('\n'): for title in titles: if line.find(title) != -1: -- Gitee From b1173d0bcfc6f05c985535d608c5ba697ff2be2a Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:22:40 +0800 Subject: [PATCH 89/96] add dynaseq release doc --- celescope/__init__.py | 2 +- docs/manual.md | 10 ++++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index e89d5496..2ac251a9 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -21,7 +21,7 @@ ASSAY_DICT = { ROOT_PATH = os.path.dirname(__file__) -RELEASED_ASSAYS = ['rna', 'vdj', 'tag', ] +RELEASED_ASSAYS = ['rna', 'vdj', 'tag', 'dynaseq'] HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', diff --git a/docs/manual.md b/docs/manual.md index b39281ce..dbdc7542 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -39,3 +39,13 @@ Currently, CeleScope includes the follwing pipelines: - [analysis_tag](tag/analysis_tag.md) - [split_tag](tag/split_tag.md) - [multi_tag](tag/multi_tag.md) +## Single Cell Dynaseq +- [star](rna/star.md) +- [featureCounts](tools/featureCounts.md) +- [count](tools/count.md) +- [analysis](rna/analysis.md) +- [conversion](dynaseq/conversion.md) +- [subsitution](dynaseq/subsitution.md) +- [replacement](dynaseq/replacement.md) +- [replace_tsne](dynaseq/replace_tsne.md) +- [multi_dynaseq](dynaseq/multi_dynaseq.md) -- Gitee From 9e6fbe0d3f74aa81b184774f950237185c1989d8 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:24:00 +0800 Subject: [PATCH 90/96] typo --- celescope/__init__.py | 2 +- docs/manual.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index 2ac251a9..ceede280 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -16,7 +16,7 @@ ASSAY_DICT = { 'tag': 'Single-cell tag', 'citeseq': 'Single Cell CITE-Seq', 'tcr_fl': 'Single Cell full length TCR', - 'dynaseq': 'Single Cell Dynaseq' + 'dynaseq': 'Single-cell dynaseq' } ROOT_PATH = os.path.dirname(__file__) diff --git a/docs/manual.md b/docs/manual.md index dbdc7542..6ffcec89 100644 --- a/docs/manual.md +++ b/docs/manual.md @@ -39,7 +39,7 @@ Currently, CeleScope includes the follwing pipelines: - [analysis_tag](tag/analysis_tag.md) - [split_tag](tag/split_tag.md) - [multi_tag](tag/multi_tag.md) -## Single Cell Dynaseq +## Single-cell dynaseq - [star](rna/star.md) - [featureCounts](tools/featureCounts.md) - [count](tools/count.md) -- Gitee From 534298a9fcbf923881d42317583c3fb7d18ee116 Mon Sep 17 00:00:00 2001 From: zhouyiqi Date: Fri, 25 Jun 2021 14:29:24 +0800 Subject: [PATCH 91/96] update docs --- celescope/__init__.py | 3 ++- celescope/snp/mkref.py | 3 ++- docs/snp/mkref.md | 2 +- docs/snp/multi_snp.md | 2 +- docs/tools/target_metrics.md | 2 +- 5 files changed, 7 insertions(+), 5 deletions(-) diff --git a/celescope/__init__.py b/celescope/__init__.py index ceede280..a4d2a5d0 100755 --- a/celescope/__init__.py +++ b/celescope/__init__.py @@ -25,8 +25,9 @@ RELEASED_ASSAYS = ['rna', 'vdj', 'tag', 'dynaseq'] HELP_DICT = { 'match_dir': 'Match celescope scRNA-Seq directory.', - 'gene_list': 'Gene list 
+    'gene_list': 'Required. Gene list file, one gene symbol per line. Only results of these genes are reported.',
     'genomeDir': 'Genome directory after running `mkref`.',
     'thread': 'Thread to use.',
     'debug': 'If this argument is used, celescope may output additional files for debugging.',
+    'fasta': 'Required. Genome fasta file. Use relative path to `genomeDir`.',
 }
diff --git a/celescope/snp/mkref.py b/celescope/snp/mkref.py
index e8659428..b4795e33 100644
--- a/celescope/snp/mkref.py
+++ b/celescope/snp/mkref.py
@@ -5,6 +5,7 @@ import subprocess
 import celescope.tools.utils as utils
 from celescope.tools.mkref import Mkref
 from celescope.tools.mkref import get_opts_mkref as opts
+from celescope.__init__ import HELP_DICT
 
 
 class Mkref_snp(Mkref):
@@ -78,4 +79,4 @@ def mkref(args):
 def get_opts_mkref(parser, sub_program):
     opts(parser, sub_program)
     if sub_program:
-        parser.add_argument("--fasta", help="fasta file", required=True)
+        parser.add_argument("--fasta", help=HELP_DICT['fasta'], required=True)
diff --git a/docs/snp/mkref.md b/docs/snp/mkref.md
index 34858a56..323b3771 100644
--- a/docs/snp/mkref.md
+++ b/docs/snp/mkref.md
@@ -25,5 +25,5 @@ celescope snp mkref \
 
 `--dry_run` Only write config file and exit.
 
-`--fasta` fasta file
+`--fasta` Required. Genome fasta file. Use relative path to `genomeDir`.
 
diff --git a/docs/snp/multi_snp.md b/docs/snp/multi_snp.md
index 481b950f..e6b6435a 100644
--- a/docs/snp/multi_snp.md
+++ b/docs/snp/multi_snp.md
@@ -105,7 +105,7 @@ is higher than or equal to this value.
 
 `--gtf_type` Specify feature type in GTF annotation
 
-`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+`--gene_list` Required. Gene list file, one gene symbol per line. Only results of these genes are reported.
 
 `--genomeDir` Genome directory after running `mkref`.
 
diff --git a/docs/tools/target_metrics.md b/docs/tools/target_metrics.md
index d2fbaa04..9e170bf9 100644
--- a/docs/tools/target_metrics.md
+++ b/docs/tools/target_metrics.md
@@ -10,7 +10,7 @@
 
 ## Arguments
 
-`--gene_list` Gene list file, one gene symbol per line. Only results of these genes are reported.
+`--gene_list` Required. Gene list file, one gene symbol per line. Only results of these genes are reported.
`--bam` Input bam file -- Gitee From 75446cbacb1288a8c86dcd65540c8ee85f7d9882 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 25 Jun 2021 18:19:30 +0800 Subject: [PATCH 92/96] add mapping --- celescope/templates/html/trust_vdj/base.html | 4 +- ...mble_summary.html => mapping_summary.html} | 4 +- celescope/trust_vdj/__init__.py | 3 +- .../{trust_assemble.py => assemble.py} | 223 ++++++------------ celescope/trust_vdj/convert.py | 7 +- celescope/trust_vdj/mapping.py | 122 ++++++++++ celescope/trust_vdj/multi_trust_vdj.py | 13 +- celescope/trust_vdj/res_filter.py | 8 +- 8 files changed, 222 insertions(+), 162 deletions(-) rename celescope/templates/html/trust_vdj/{trust_assemble_summary.html => mapping_summary.html} (93%) rename celescope/trust_vdj/{trust_assemble.py => assemble.py} (32%) create mode 100644 celescope/trust_vdj/mapping.py diff --git a/celescope/templates/html/trust_vdj/base.html b/celescope/templates/html/trust_vdj/base.html index fcd8607c..36025b76 100644 --- a/celescope/templates/html/trust_vdj/base.html +++ b/celescope/templates/html/trust_vdj/base.html @@ -133,8 +133,8 @@ {% include "html/trust_vdj/convert_summary.html"%} {% endif %} - {% if trust_assemble_summary is defined %} - {% include "html/trust_vdj/trust_assemble_summary.html"%} + {% if mapping_summary is defined %} + {% include "html/trust_vdj/mapping_summary.html"%} {% endif %} {% if res_filter_summary is defined %} diff --git a/celescope/templates/html/trust_vdj/trust_assemble_summary.html b/celescope/templates/html/trust_vdj/mapping_summary.html similarity index 93% rename from celescope/templates/html/trust_vdj/trust_assemble_summary.html rename to celescope/templates/html/trust_vdj/mapping_summary.html index 0d18b19f..797e9199 100644 --- a/celescope/templates/html/trust_vdj/trust_assemble_summary.html +++ b/celescope/templates/html/trust_vdj/mapping_summary.html @@ -13,7 +13,7 @@

reads Mapped to TRB: reads confidently mapped to TRB chain.

- {% for item in trust_assemble_summary %} + {% for item in mapping_summary %} {% if loop.index <= (loop.length+1)/2 %} {% for i in item %} @@ -25,7 +25,7 @@
- {% for item in trust_assemble_summary %} + {% for item in mapping_summary %} {% if loop.index > (loop.length+1)/2 %} {% for i in item %} diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py index f95b64a2..8b95beeb 100644 --- a/celescope/trust_vdj/__init__.py +++ b/celescope/trust_vdj/__init__.py @@ -1,6 +1,7 @@ __STEPS__ = [ 'sample', 'convert', - 'trust_assemble', + 'assemble', + 'mapping', 'res_filter'] __ASSAY__ = 'trust_vdj' diff --git a/celescope/trust_vdj/trust_assemble.py b/celescope/trust_vdj/assemble.py similarity index 32% rename from celescope/trust_vdj/trust_assemble.py rename to celescope/trust_vdj/assemble.py index 8cf8f76a..6eb35684 100644 --- a/celescope/trust_vdj/trust_assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -1,19 +1,20 @@ import os from celescope.tools import utils from celescope.tools.Step import Step, s_common -from celescope.tracer_vdj.split_fastq import get_barcodes -from celescope.tools.barcode import * import pysam import pandas as pd from collections import defaultdict +import glob +import re +from Bio.Seq import Seq -TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/run-trust4' +TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/' - -def count_fq(fq1): +@utils.add_log +def count_fq(fq): dic = defaultdict(list) - with pysam.FastxFile(fq1) as fq: + with pysam.FastxFile(fq) as fq: for entry in fq: attr = entry.sequence cb = attr[:24] @@ -24,146 +25,85 @@ def count_fq(fq1): dic['seq_name'].append(name) count_df = pd.DataFrame(dic, columns=list(dic.keys())) - - return count_df - -@utils.add_log -def match_barcodes(outdir, match_dir, Seqtype, fq1): - annotated_bcs = get_barcodes(match_dir, Seqtype) - bcs_df = pd.DataFrame(annotated_bcs, columns=['barcode']) - count_df = count_fq(fq1) + return count_df - # count UMI - df_umi = count_df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) - df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) - df_umi = df_umi.sort_values(by='UMI', ascending=False) - df_umi.to_csv(f'{outdir}/count.txt', sep='\t', index=False) - df_n = pd.merge(bcs_df, count_df, on='barcode', how='inner') - seqnames = df_n['seq_name'].tolist() - seqlist = open(f'{outdir}/seqlist.txt', 'w') - for name in seqnames: - seqlist.write(str(name) + '\n') +class Assemble(Step): + """ + Features + - Get fq file + """ -def clean_fq(fq1, fq2, outdir, sample, species): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) - prefix = f'{outdir}/{sample}_clean' + self.outdir = args.outdir + self.fq1 = args.fq1 + self.fq2 = args.fq2 + self.sample = args.sample + self.species = args.species + self.speed_up = args.speed_up + self.match_dir = args.match_dir + self.cells = args.cells - cmd = ( - f'/SGRNJ03/randd/zhouxin/software/TRUST4/fastq-extractor ' - f'-t 10 -f /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa ' - f'-o {prefix} --barcodeStart 0 --barcodeEnd 23 ' - f'-u {fq2} ' - f'--barcode {fq1}' - ) + @utils.add_log + def get_barcodes(self): + tsne = glob.glob(f'{self.match_dir}/06.analysis/*_tsne_coord.tsv') + tsne = tsne[0] + tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) + barcodes = tsne_coord.index.tolist() - os.system(cmd) + # write barcodes + res = [] + for barcode in barcodes: + barcode = Seq(barcode) + barcode_reversed = barcode.reverse_complement() + bc = str(barcode_reversed) + res.append(bc) + df = pd.DataFrame(res, columns=['barcode']) -def mapping_summary(outdir, Seqtype, fq, species): - - stat_file = outdir + '/stat.txt' + 
return df - trust_assemble_summary = [] + @utils.add_log + def cut_off(self): + barcodes = self.get_barcodes() + df = count_fq(self.fq1) + df_umi = df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) + df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) - total_mapped = 0 + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi = df_umi.reset_index() - #with pysam.FastxFile(fq) as fh: - #total_count = 0 - #for entry in fh: - #total_count += 1 + UMI_num = int(self.cells) + rank = UMI_num / 100 + rank_UMI = df_umi.loc[rank, 'UMI'] + UMI_min = int(rank_UMI / 10) - if Seqtype == 'TCR': - loci = ['TRA', 'TRB'] - stat_string = 'All reads Mapped to TRA and TRB' + df_umi_filtered = df_umi[df_umi.UMI >= UMI_min] - elif Seqtype == 'BCR': - loci = ['IGH', 'IGL', 'IGK'] - stat_string = 'All reads Mapped to IGH, IGL and IGK' + df_tmp = pd.merge(df_umi_filtered, barcodes, on='barcode', how='inner') - for locus in loci: - cmd = ( - f'source activate bracer; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{locus} ' - f'-U {fq} ' - f'-S {outdir}/{locus}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - - with open(f'{outdir}/log') as fh: - for line in fh: - if 'reads; of these:' in line: - attr = re.findall(r'\d+', line) - total_count = int(attr[0]) - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - item = f'Reads mapped to {locus}' - count = int(res[0]) - total_mapped += count - trust_assemble_summary.append({ - 'item': item, - 'count': count, - 'total_count': total_count, - }) - # os.system(f'rm {outdir}/{locus}.sam') - - # total mapping - cmd = ( - f'source activate bracer; ' - f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' - f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{Seqtype} ' - f'-U {fq} ' - f'-S {outdir}/{Seqtype}.sam > {outdir}/log 2>&1' - ) - os.system(cmd) - with open(f'{outdir}/log') as fh: - for line in fh: - if 'reads; of these:' in line: - attr = re.findall(r'\d+', line) - total_count = int(attr[0]) - if 'aligned exactly 1 time' in line: - res = re.findall(r"\d+", line) - count = int(res[0]) - trust_assemble_summary.insert(0, { - 'item': stat_string, - 'count': count, - 'total_count': total_count, - }) - - os.system(f'rm {outdir}/*.sam') - os.system(f'rm {outdir}/log') - - df = pd.DataFrame(trust_assemble_summary, columns=['item', 'count', 'total_count']) - - utils.gen_stat(df, stat_file) - - -class Trust_assemble(Step): - """ - Features + matched_barcodes = df_tmp.barcode.tolist() + with open(f'{self.outdir}/{self.sample}_matched_barcodes.txt', 'w') as fh: + for barcode in matched_barcodes: + fh.write(str(barcode)+ '\n') + string = f'Get {len(matched_barcodes)} matched barcodes' - - Get fq file - """ + Assemble.cut_off.logger.info(string) - def __init__(self, args, step_name): - Step.__init__(self, args, step_name) + df_all = pd.merge(df_tmp, df, on='barcode', how='outer') + seq_list = df_all['seq_name'].tolist() - self.outdir = args.outdir - self.match_dir = args.match_dir - self.Seqtype = args.Seqtype - self.fq1 = args.fq1 - self.fq2 = args.fq2 - self.sample = args.sample - self.species = args.species - self.speed_up = args.speed_up + with open(f'{self.outdir}/seqlist.txt', 'w') as fh: + for name in seq_list: + fh.write(str(name)+'\n') @utils.add_log def getFqfile(self): - match_barcodes(self.outdir, self.match_dir, self.Seqtype, self.fq1) cmd1 = ( f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > 
{self.outdir}/{self.sample}_matched_R1.fq' @@ -179,19 +119,19 @@ class Trust_assemble(Step): @utils.add_log def run(self): - if not os.path.exists(f'{self.outdir}/{self.sample}_matched_R2.fq'): - self.getFqfile() + self.cut_off() + self.getFqfile() species = self.species - index_file = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_ref.fa' - ref = f'/SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{species}_IMGT+C.fa' + index_file = f'{TRUST}/index/{species}/{species}_ref.fa' + ref = f'{TRUST}/index/{species}/{species}_IMGT+C.fa' string1 = '' if self.speed_up: string1 = '--repseq ' cmd = ( - f'{TRUST} -t {self.thread} ' + f'{TRUST}/run-trust4 -t {self.thread} ' f'-u {self.outdir}/{self.sample}_matched_R2.fq ' f'--barcode {self.outdir}/{self.sample}_matched_R1.fq ' f'--barcodeRange 0 23 + ' @@ -201,7 +141,7 @@ class Trust_assemble(Step): f'-o {self.sample} --od {self.outdir}/TRUST4' ) - Trust_assemble.run.logger.info(cmd) + Assemble.run.logger.info(cmd) if not os.path.exists(f'{self.outdir}/TRUST4/{self.sample}_barcode_report.tsv'): os.system(cmd) @@ -209,32 +149,25 @@ class Trust_assemble(Step): #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' # report - clean_fq(self.fq1, self.fq2, self.outdir, self.sample, species) - - fq = f'{self.outdir}/{self.sample}_clean.fq' - - mapping_summary(self.outdir, self.Seqtype, fq, species) - - os.remove(f'{self.outdir}/seqlist.txt') - - self.clean_up() + os.system(f'rm {self.outdir}/seqlist.txt') @utils.add_log -def trust_assemble(args): - step_name = 'trust_assemble' - trust_assemble_obj = Trust_assemble(args, step_name) - trust_assemble_obj.run() +def assemble(args): + step_name = 'assemble' + assemble_obj = Assemble(args, step_name) + assemble_obj.run() -def get_opts_trust_assemble(parser, sub_program): +def get_opts_assemble(parser, sub_program): if sub_program: parser = s_common(parser) parser.add_argument('--fq1', help='R1 reads from barcode step', required=True) parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) - parser.add_argument('--match_dir', help='match_dir', required=True) - parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--match_dir', help='rna analysis dir', required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--cells', help='expected cell number', default=3000) parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') diff --git a/celescope/trust_vdj/convert.py b/celescope/trust_vdj/convert.py index 4039a215..511e4bcb 100644 --- a/celescope/trust_vdj/convert.py +++ b/celescope/trust_vdj/convert.py @@ -1,12 +1,7 @@ """barcode step.""" -import os import re -import subprocess -import sys -import glob -from collections import defaultdict, Counter -from itertools import combinations, product +from collections import Counter import pandas as pd import pysam diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py new file mode 100644 index 00000000..a303e46d --- /dev/null +++ b/celescope/trust_vdj/mapping.py @@ -0,0 +1,122 @@ +import pandas as pd +import glob +from celescope.tools.Step import Step, s_common +from celescope.tools import utils +import os +import re + + +class Mapping(Step): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.match_dir = args.match_dir + self.Seqtype = args.Seqtype + self.sample = 
args.sample + self.species = args.species + + @utils.add_log + def align(self): + species = self.species + outdir = self.outdir + Seqtype = self.Seqtype + + stat_file = self.outdir + '/stat.txt' + fq = f'{outdir}/../02.assemble/TRUST4/{self.sample}_toassemble.fq' + + mapping_summary = [] + + total_mapped = 0 + + #with pysam.FastxFile(fq) as fh: + #total_count = 0 + #for entry in fh: + #total_count += 1 + + if Seqtype == 'TCR': + loci = ['TRA', 'TRB'] + stat_string = 'All reads Mapped to TRA and TRB' + + elif Seqtype == 'BCR': + loci = ['IGH', 'IGL', 'IGK'] + stat_string = 'All reads Mapped to IGH, IGL and IGK' + + for locus in loci: + cmd = ( + f'source activate bracer; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{locus} ' + f'-U {fq} ' + f'-S {outdir}/{locus}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + item = f'Reads mapped to {locus}' + count = int(res[0]) + total_mapped += count + mapping_summary.append({ + 'item': item, + 'count': count, + 'total_count': total_count, + }) + # os.system(f'rm {outdir}/{locus}.sam') + + # total mapping + cmd = ( + f'source activate full_len_VDJ; ' + f'bowtie2 -p 5 -k 1 --np 0 --rdg 1,1 --rfg 1,1 ' + f'-x /SGRNJ03/randd/zhouxin/software/TRUST4/index/{species}/{Seqtype} ' + f'-U {fq} ' + f'-S {outdir}/{Seqtype}.sam > {outdir}/log 2>&1' + ) + os.system(cmd) + with open(f'{outdir}/log') as fh: + for line in fh: + if 'reads; of these:' in line: + attr = re.findall(r'\d+', line) + total_count = int(attr[0]) + if 'aligned exactly 1 time' in line: + res = re.findall(r"\d+", line) + count = int(res[0]) + mapping_summary.insert(0, { + 'item': stat_string, + 'count': count, + 'total_count': total_count, + }) + + os.system(f'rm {outdir}/*.sam') + os.system(f'rm {outdir}/log') + + df = pd.DataFrame(mapping_summary, columns=['item', 'count', 'total_count']) + + utils.gen_stat(df, stat_file) + + @utils.add_log + def run(self): + self.align() + + self.clean_up() + + +def mapping(args): + step_name = 'mapping' + mapping_obj = Mapping(args, step_name) + mapping_obj.run() + + +def get_opts_mapping(parser, sub_program): + if sub_program: + parser = s_common(parser) + + parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) + parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + + diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py index 8c89ad45..bd5ecc1e 100644 --- a/celescope/trust_vdj/multi_trust_vdj.py +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -15,8 +15,8 @@ class Multi_trust_vdj(Multi): self.process_cmd(cmd, step, sample, m=5, x=1) - def trust_assemble(self, sample): - step = 'trust_assemble' + def assemble(self, sample): + step = 'assemble' cmd_line = self.get_cmd_line(step, sample) fq1 = f'{self.outdir_dic[sample]["convert"]}/{sample}_1.fq{self.fq_suffix}' fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' @@ -29,6 +29,15 @@ class Multi_trust_vdj(Multi): self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) + def mapping(self, sample): + step = 'mapping' + cmd_line = self.get_cmd_line(step, sample) + cmd = ( + f'{cmd_line}' + ) + self.process_cmd(cmd, step, sample, m=5, x=5) + + def res_filter(self, sample): step = 
'res_filter' cmd_line = self.get_cmd_line(step, sample) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index ed7a2720..eb69837f 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -63,13 +63,13 @@ def get_clone_table(df, Seqtype): if Seqtype == 'BCR': chains = ['IGH', 'IGL', 'IGK'] paired_groups = ['IGH_IGL', 'IGH_IGK'] - for chain in chains: - tmp = df[df['V'].str.contains(chain, na=False)] + for c in chains: + tmp = df[df['V'].str.contains(c, na=False)] tmp = tmp.set_index('barcode') - tmp = tmp.rename(columns=lambda x: f'{chain}_'+x) + tmp = tmp.rename(columns=lambda x: f'{c}_'+x) res = pd.concat([res, tmp], axis=1, join='outer', sort=False).fillna('None') - group_type.append(f'{chain}_CDR3aa') + group_type.append(f'{c}_CDR3aa') Frequent = [''] * res.shape[0] res.insert(res.shape[1], 'Frequent', Frequent) -- Gitee From 58cd8ec57fe7d56a7ee33eb27e6923af2f0d7796 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Fri, 25 Jun 2021 20:06:30 +0800 Subject: [PATCH 93/96] change pipeline --- celescope/trust_vdj/__init__.py | 1 + celescope/trust_vdj/assemble.py | 109 +------------------- celescope/trust_vdj/mapping.py | 6 +- celescope/trust_vdj/matching.py | 135 +++++++++++++++++++++++++ celescope/trust_vdj/multi_trust_vdj.py | 29 +++++- celescope/trust_vdj/res_filter.py | 14 ++- 6 files changed, 180 insertions(+), 114 deletions(-) create mode 100644 celescope/trust_vdj/matching.py diff --git a/celescope/trust_vdj/__init__.py b/celescope/trust_vdj/__init__.py index 8b95beeb..0e8f35cf 100644 --- a/celescope/trust_vdj/__init__.py +++ b/celescope/trust_vdj/__init__.py @@ -1,6 +1,7 @@ __STEPS__ = [ 'sample', 'convert', + 'matching', 'assemble', 'mapping', 'res_filter'] diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py index 6eb35684..6109d967 100644 --- a/celescope/trust_vdj/assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -1,39 +1,17 @@ import os from celescope.tools import utils from celescope.tools.Step import Step, s_common -import pysam import pandas as pd -from collections import defaultdict -import glob -import re -from Bio.Seq import Seq TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/' -@utils.add_log -def count_fq(fq): - dic = defaultdict(list) - with pysam.FastxFile(fq) as fq: - for entry in fq: - attr = entry.sequence - cb = attr[:24] - umi = attr[24:] - name = entry.name - dic['barcode'].append(cb) - dic['UMI'].append(umi) - dic['seq_name'].append(name) - - count_df = pd.DataFrame(dic, columns=list(dic.keys())) - - return count_df - class Assemble(Step): """ Features - - Get fq file + - Assemble TCR/BCR """ def __init__(self, args, step_name): @@ -45,83 +23,11 @@ class Assemble(Step): self.sample = args.sample self.species = args.species self.speed_up = args.speed_up - self.match_dir = args.match_dir - self.cells = args.cells - - @utils.add_log - def get_barcodes(self): - tsne = glob.glob(f'{self.match_dir}/06.analysis/*_tsne_coord.tsv') - tsne = tsne[0] - tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) - barcodes = tsne_coord.index.tolist() - - # write barcodes - res = [] - for barcode in barcodes: - barcode = Seq(barcode) - barcode_reversed = barcode.reverse_complement() - bc = str(barcode_reversed) - res.append(bc) - - df = pd.DataFrame(res, columns=['barcode']) - - return df - - @utils.add_log - def cut_off(self): - barcodes = self.get_barcodes() - df = count_fq(self.fq1) - df_umi = df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) - df_umi = 
df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) - - df_umi = df_umi.sort_values(by='UMI', ascending=False) - df_umi = df_umi.reset_index() - - UMI_num = int(self.cells) - rank = UMI_num / 100 - rank_UMI = df_umi.loc[rank, 'UMI'] - UMI_min = int(rank_UMI / 10) - - df_umi_filtered = df_umi[df_umi.UMI >= UMI_min] - - df_tmp = pd.merge(df_umi_filtered, barcodes, on='barcode', how='inner') - - matched_barcodes = df_tmp.barcode.tolist() - with open(f'{self.outdir}/{self.sample}_matched_barcodes.txt', 'w') as fh: - for barcode in matched_barcodes: - fh.write(str(barcode)+ '\n') - string = f'Get {len(matched_barcodes)} matched barcodes' - - Assemble.cut_off.logger.info(string) - - df_all = pd.merge(df_tmp, df, on='barcode', how='outer') - seq_list = df_all['seq_name'].tolist() - - with open(f'{self.outdir}/seqlist.txt', 'w') as fh: - for name in seq_list: - fh.write(str(name)+'\n') - - - @utils.add_log - def getFqfile(self): - - cmd1 = ( - f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R1.fq' - ) - os.system(cmd1) - - cmd2 = ( - f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R2.fq' - ) - os.system(cmd2) @utils.add_log def run(self): - self.cut_off() - self.getFqfile() - species = self.species index_file = f'{TRUST}/index/{species}/{species}_ref.fa' @@ -132,8 +38,8 @@ class Assemble(Step): string1 = '--repseq ' cmd = ( f'{TRUST}/run-trust4 -t {self.thread} ' - f'-u {self.outdir}/{self.sample}_matched_R2.fq ' - f'--barcode {self.outdir}/{self.sample}_matched_R1.fq ' + f'-u {self.fq2} ' + f'--barcode {self.fq1} ' f'--barcodeRange 0 23 + ' f'-f {index_file} ' f'--ref {ref} ' @@ -148,9 +54,6 @@ class Assemble(Step): #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' - # report - os.system(f'rm {self.outdir}/seqlist.txt') - @utils.add_log def assemble(args): @@ -162,12 +65,10 @@ def assemble(args): def get_opts_assemble(parser, sub_program): if sub_program: parser = s_common(parser) - parser.add_argument('--fq1', help='R1 reads from barcode step', required=True) - parser.add_argument('--fq2', help='R2 reads from barcode step', required=True) - parser.add_argument('--match_dir', help='rna analysis dir', required=True) + parser.add_argument('--fq1', help='R1 reads from match step', required=True) + parser.add_argument('--fq2', help='R2 reads from match step', required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) - parser.add_argument('--cells', help='expected cell number', default=3000) parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py index a303e46d..d187befa 100644 --- a/celescope/trust_vdj/mapping.py +++ b/celescope/trust_vdj/mapping.py @@ -11,19 +11,20 @@ class Mapping(Step): Step.__init__(self, args, step_name) self.outdir = args.outdir - self.match_dir = args.match_dir self.Seqtype = args.Seqtype self.sample = args.sample self.species = args.species + self.fq = args.fq + @utils.add_log def align(self): species = self.species outdir = self.outdir Seqtype = self.Seqtype + fq = self.fq stat_file = self.outdir + '/stat.txt' - fq = f'{outdir}/../02.assemble/TRUST4/{self.sample}_toassemble.fq' mapping_summary = [] @@ -115,6 +116,7 @@ def mapping(args): def get_opts_mapping(parser, sub_program): if sub_program: parser = s_common(parser) + parser.add_argument('--fq', help='to assemble fastq', required=True) 
parser.add_argument('--Seqtype', help='select TCR or BCR', choices=["TCR", "BCR"], required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) diff --git a/celescope/trust_vdj/matching.py b/celescope/trust_vdj/matching.py new file mode 100644 index 00000000..17c20d6f --- /dev/null +++ b/celescope/trust_vdj/matching.py @@ -0,0 +1,135 @@ +import os +from celescope.tools import utils +from celescope.tools.Step import Step, s_common +import pysam +import pandas as pd +from collections import defaultdict +import glob +import re +from Bio.Seq import Seq + + +@utils.add_log +def count_fq(fq): + dic = defaultdict(list) + with pysam.FastxFile(fq) as fq: + for entry in fq: + attr = entry.sequence + cb = attr[:24] + umi = attr[24:] + name = entry.name + dic['barcode'].append(cb) + dic['UMI'].append(umi) + dic['seq_name'].append(name) + + count_df = pd.DataFrame(dic, columns=list(dic.keys())) + + return count_df + + +class Matching(Step): + def __init__(self, args, step_name): + Step.__init__(self, args, step_name) + + self.outdir = args.outdir + self.fq1 = args.fq1 + self.fq2 = args.fq2 + self.sample = args.sample + self.match_dir = args.match_dir + self.cells = args.cells + + + @utils.add_log + def get_barcodes(self): + tsne = glob.glob(f'{self.match_dir}/06.analysis/*_tsne_coord.tsv') + tsne = tsne[0] + tsne_coord = pd.read_csv(tsne, sep='\t', index_col=0) + barcodes = tsne_coord.index.tolist() + + # write barcodes + res = [] + for barcode in barcodes: + barcode = Seq(barcode) + barcode_reversed = barcode.reverse_complement() + bc = str(barcode_reversed) + res.append(bc) + + df = pd.DataFrame(res, columns=['barcode']) + + return df + + + @utils.add_log + def cut_off(self): + barcodes = self.get_barcodes() + df = count_fq(self.fq1) + df_umi = df.groupby(['barcode', 'UMI'], as_index=False).agg({'seq_name': 'count'}) + df_umi = df_umi.groupby(['barcode'], as_index=False).agg({'UMI': 'count'}) + + df_umi = df_umi.sort_values(by='UMI', ascending=False) + df_umi = df_umi.reset_index() + df_umi.to_csv(f'{self.outdir}/count.txt', sep='\t', index=False) + + UMI_num = int(self.cells) + rank = UMI_num / 100 + rank_UMI = df_umi.loc[rank, 'UMI'] + UMI_min = int(rank_UMI / 10) + + df_umi_filtered = df_umi[df_umi.UMI >= UMI_min] + + df_tmp = pd.merge(df_umi_filtered, barcodes, on='barcode', how='inner') + + matched_barcodes = df_tmp.barcode.tolist() + with open(f'{self.outdir}/{self.sample}_matched_barcodes.txt', 'w') as fh: + for barcode in matched_barcodes: + fh.write(str(barcode)+ '\n') + string = f'Get {len(matched_barcodes)} matched barcodes' + + Matching.cut_off.logger.info(string) + + df_all = pd.merge(df_tmp, df, on='barcode', how='outer') + seq_list = df_all['seq_name'].tolist() + + with open(f'{self.outdir}/seqlist.txt', 'w') as fh: + for name in seq_list: + fh.write(str(name)+'\n') + + + @utils.add_log + def getFqfile(self): + + cmd1 = ( + f'seqtk subseq {self.fq1} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R1.fq' + ) + os.system(cmd1) + + cmd2 = ( + f'seqtk subseq {self.fq2} {self.outdir}/seqlist.txt > {self.outdir}/{self.sample}_matched_R2.fq' + ) + os.system(cmd2) + + os.system(f'rm {self.outdir}/seqlist.txt') + + + @utils.add_log + def run(self): + self.cut_off() + self.getFqfile() + + +@utils.add_log +def matching(args): + step_name = 'matching' + match_obj = Matching(args, step_name) + match_obj.run() + + +def get_opts_matching(parser, sub_program): + if sub_program: + parser = s_common(parser) + parser.add_argument('--match_dir', 
help='rna analysis dir', required=True) + parser.add_argument('--fq1', help='R1 reads from convert step', required=True) + parser.add_argument('--fq2', help='R2 reads from convert step', required=True) + parser.add_argument('--cells', help='expected cell number', default=3000) + + diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py index bd5ecc1e..44e27e83 100644 --- a/celescope/trust_vdj/multi_trust_vdj.py +++ b/celescope/trust_vdj/multi_trust_vdj.py @@ -15,25 +15,40 @@ class Multi_trust_vdj(Multi): self.process_cmd(cmd, step, sample, m=5, x=1) - def assemble(self, sample): - step = 'assemble' + def matching(self, sample): + step = 'matching' cmd_line = self.get_cmd_line(step, sample) fq1 = f'{self.outdir_dic[sample]["convert"]}/{sample}_1.fq{self.fq_suffix}' - fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' + fq2 = f'{self.outdir_dic[sample]["convert"]}/{sample}_2.fq{self.fq_suffix}' cmd = ( f'{cmd_line} ' f'--fq1 {fq1} ' f'--fq2 {fq2} ' f'--match_dir {self.col4_dict[sample]}' ) + self.process_cmd(cmd, step, sample, m=5, x=3) + + + def assemble(self, sample): + step = 'assemble' + cmd_line = self.get_cmd_line(step, sample) + fq1 = f'{self.outdir_dic[sample]["matching"]}/{sample}_matched_R1.fq' + fq2 = f'{self.outdir_dic[sample]["matching"]}/{sample}_matched_R2.fq' + cmd = ( + f'{cmd_line} ' + f'--fq1 {fq1} ' + f'--fq2 {fq2} ' + ) self.process_cmd(cmd, step, sample, m=15, x=self.args.thread) def mapping(self, sample): step = 'mapping' cmd_line = self.get_cmd_line(step, sample) + fq = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_toassemble.fq' cmd = ( - f'{cmd_line}' + f'{cmd_line} ' + f'--fq {fq}' ) self.process_cmd(cmd, step, sample, m=5, x=5) @@ -41,8 +56,14 @@ class Multi_trust_vdj(Multi): def res_filter(self, sample): step = 'res_filter' cmd_line = self.get_cmd_line(step, sample) + report = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_barcode_report.tsv' + fa = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_annot.fa' + count_file = f'{self.outdir_dic[sample]["matching"]}/count.txt' cmd = ( f'{cmd_line} ' + f'--report {report} ' + f'--fa {fa} ' + f'--count_file {count_file} ' ) self.process_cmd(cmd, step, sample, m=5, x=1) diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py index eb69837f..fabd63af 100644 --- a/celescope/trust_vdj/res_filter.py +++ b/celescope/trust_vdj/res_filter.py @@ -137,12 +137,15 @@ class Res_filter(Step): self.sample = args.sample self.Seqtype = args.Seqtype self.full_length = args.full_length + self.report = args.report + self.fa = args.fa + self.count_file = args.count_file @utils.add_log def run(self): - barcode_report = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_barcode_report.tsv' - fa = f'{self.outdir}/../02.trust_assemble/TRUST4/{self.sample}_annot.fa' + barcode_report = self.report + fa = self.fa df = beauty_report(barcode_report, fa) if self.full_length: @@ -152,7 +155,7 @@ class Res_filter(Step): clones, res_filter_summary = get_clone_table(df, self.Seqtype) # plot barcode umi - count_file = f'{self.outdir}/../02.trust_assemble/count.txt' + count_file = self.count_file df_umi = pd.read_csv(count_file, sep='\t', index_col=False) cells = set(df['barcode'].tolist()) df_umi['mark'] = df_umi['barcode'].apply(lambda x: 'CB' if (x in cells) else 'UB') @@ -212,4 +215,7 @@ def get_opts_res_filter(parser, sub_program): parser.add_argument('--Seqtype', help='TCR or BCR', choices=['TCR', 'BCR'], required=True) 
parser.add_argument('--full_length', help='only output full length assembly', action='store_true')
     if sub_program:
-        parser = s_common(parser)
\ No newline at end of file
+        parser = s_common(parser)
+        parser.add_argument('--report', help='assemble report', required=True)
+        parser.add_argument('--fa', help='assembled fasta file', required=True)
+        parser.add_argument('--count_file', help='UMI count file', required=True)
\ No newline at end of file
-- 
Gitee


From 3e506454fdb030eaa67efb8763019f9000728081 Mon Sep 17 00:00:00 2001
From: zhouxinseeu
Date: Mon, 28 Jun 2021 11:26:08 +0800
Subject: [PATCH 94/96] add func doc and add rerun opt

---
 celescope/trust_vdj/assemble.py        | 16 +++++++++++++---
 celescope/trust_vdj/convert.py         | 15 ++++++++++++++-
 celescope/trust_vdj/mapping.py         |  6 ++++++
 celescope/trust_vdj/matching.py        | 15 ++++++++++++++-
 celescope/trust_vdj/multi_trust_vdj.py |  6 +++---
 celescope/trust_vdj/res_filter.py      |  9 +++++++++
 6 files changed, 59 insertions(+), 8 deletions(-)

diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py
index 6109d967..c4e0abdb 100644
--- a/celescope/trust_vdj/assemble.py
+++ b/celescope/trust_vdj/assemble.py
@@ -11,7 +11,17 @@ class Assemble(Step):
     """
     Features
 
-    - Assemble TCR/BCR
+    - Assemble TCR/BCR seq data.
+
+    Output
+
+    - `03.assemble/{sample}_toassemble.fq` Reads to assemble.
+    - `03.assemble/{sample}_toassemble_bc.fa` Barcodes to assemble.
+    - `03.assemble/{sample}_cdr3.out` All assembled CDR3 output.
+    - `03.assemble/{sample}_barcode_report.tsv` Record chain information in each barcode.
+    - `03.assemble/{sample}_annot.fa` Assembled annotated contig sequences.
+    - `03.assemble/{sample}_assembled_reads.fa` Assembled raw reads.
+    - `03.assemble/{sample}_report.tsv` Record assembled CDR3 types and count.
     """
 
     def __init__(self, args, step_name):
@@ -44,12 +54,12 @@ class Assemble(Step):
             f'-f {index_file} '
             f'--ref {ref} '
             f'{string1}'
-            f'-o {self.sample} --od {self.outdir}/TRUST4'
+            f'-o {self.sample} --od {self.outdir}'
         )
 
         Assemble.run.logger.info(cmd)
 
-        if not os.path.exists(f'{self.outdir}/TRUST4/{self.sample}_barcode_report.tsv'):
+        if not os.path.exists(f'{self.outdir}/{self.sample}_barcode_report.tsv'):
             os.system(cmd)
 
         #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq'
diff --git a/celescope/trust_vdj/convert.py b/celescope/trust_vdj/convert.py
index 511e4bcb..440fdfe7 100644
--- a/celescope/trust_vdj/convert.py
+++ b/celescope/trust_vdj/convert.py
@@ -16,8 +16,21 @@ from celescope.tools.Step import Step, s_common
 
 
 class Convert(Step):
-    '''convert step class
+    '''
+    Features
+
+    - Demultiplex barcodes.
+    - Filter invalid R1 reads, which include:
+        - Reads without linker: the mismatch between linkers and all linkers in the whitelist is greater than 2.
+        - Reads without correct barcode: the mismatch between barcodes and all barcodes in the whitelist is greater than 1.
+        - Reads without polyT: the number of T bases in the defined polyT region is less than 10.
+        - Low quality reads: low sequencing quality in barcode and UMI regions.
+
+    Output
+
+    - `01.convert/{sample}_1.fq(.gz)`, `01.convert/{sample}_2.fq(.gz)`. Barcode and UMI are contained in the R1 reads.
     '''
+
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py
index d187befa..b6a5b09b 100644
--- a/celescope/trust_vdj/mapping.py
+++ b/celescope/trust_vdj/mapping.py
@@ -7,6 +7,12 @@ import re
 
 
 class Mapping(Step):
+    """
+    Features
+
+    - Calculate the mapping rate of reads to each V(D)J locus and to all loci combined.
+
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
diff --git a/celescope/trust_vdj/matching.py b/celescope/trust_vdj/matching.py
index 17c20d6f..2c5f1e33 100644
--- a/celescope/trust_vdj/matching.py
+++ b/celescope/trust_vdj/matching.py
@@ -28,6 +28,19 @@ def count_fq(fq):
 
 
 class Matching(Step):
+    """
+    Features
+
+    - Cut off V(D)J barcodes by UMI count. The default threshold is 1/10 of the UMI count of the barcode ranked at `--cells`/100 (the 30th barcode for the default 3000 cells).
+    - Match V(D)J barcodes that pass the cut-off with RNA cell barcodes.
+
+    Output
+
+    - `02.matching/count.txt`. Records the UMI count of each barcode in the raw V(D)J data.
+    - `02.matching/{sample}_matched_barcodes.txt`. Contains the matched barcodes.
+    - `02.matching/{sample}_matched_R1.fq`, `02.matching/{sample}_matched_R2.fq`. Barcode and UMI are contained in the R1 reads.
+
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
@@ -87,7 +100,7 @@ class Matching(Step):
 
         Matching.cut_off.logger.info(string)
 
-        df_all = pd.merge(df_tmp, df, on='barcode', how='outer')
+        df_all = pd.merge(df_tmp, df, on='barcode', how='inner')
         seq_list = df_all['seq_name'].tolist()
 
         with open(f'{self.outdir}/seqlist.txt', 'w') as fh:
diff --git a/celescope/trust_vdj/multi_trust_vdj.py b/celescope/trust_vdj/multi_trust_vdj.py
index 44e27e83..bb331390 100644
--- a/celescope/trust_vdj/multi_trust_vdj.py
+++ b/celescope/trust_vdj/multi_trust_vdj.py
@@ -45,7 +45,7 @@ class Multi_trust_vdj(Multi):
     def mapping(self, sample):
         step = 'mapping'
         cmd_line = self.get_cmd_line(step, sample)
-        fq = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_toassemble.fq'
+        fq = f'{self.outdir_dic[sample]["assemble"]}/{sample}_toassemble.fq'
         cmd = (
             f'{cmd_line} '
             f'--fq {fq}'
         )
@@ -56,8 +56,8 @@ class Multi_trust_vdj(Multi):
     def res_filter(self, sample):
         step = 'res_filter'
         cmd_line = self.get_cmd_line(step, sample)
-        report = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_barcode_report.tsv'
-        fa = f'{self.outdir_dic[sample]["assemble"]}/TRUST4/{sample}_annot.fa'
+        report = f'{self.outdir_dic[sample]["assemble"]}/{sample}_barcode_report.tsv'
+        fa = f'{self.outdir_dic[sample]["assemble"]}/{sample}_annot.fa'
         count_file = f'{self.outdir_dic[sample]["matching"]}/count.txt'
         cmd = (
             f'{cmd_line} '
diff --git a/celescope/trust_vdj/res_filter.py b/celescope/trust_vdj/res_filter.py
index fabd63af..85b30209 100644
--- a/celescope/trust_vdj/res_filter.py
+++ b/celescope/trust_vdj/res_filter.py
@@ -129,6 +129,15 @@ def get_clone_table(df, Seqtype):
 
 
 class Res_filter(Step):
+    """
+    Features
+
+    - Calculate clonetypes.
+
+    Output
+    - `05.res_filter/clonetypes.tsv` Record each clonetype and its frequency.
+    - `05.res_filter/{sample}_barcode_report.tsv` Record detailed chain information of each barcode.
+    """
     def __init__(self, args, step_name):
         Step.__init__(self, args, step_name)
 
-- 
Gitee
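The cut-off rule documented in `matching.py` above reads as a standalone recipe: rank barcodes by UMI count, take the count of the barcode at rank `--cells`/100, and keep every barcode with at least one tenth of that count. A minimal sketch under those assumptions — `umi_cutoff` and `expected_cells` are illustrative names, not part of the patch, and the input is assumed to be a pandas DataFrame with one row per barcode and a `UMI` column:

    import pandas as pd

    def umi_cutoff(df_umi, expected_cells=3000):
        # Rank barcodes by UMI count, highest first.
        df_umi = df_umi.sort_values(by='UMI', ascending=False).reset_index(drop=True)
        # UMI count of the barcode at rank expected_cells / 100 (30 by default).
        rank = int(expected_cells / 100)
        umi_min = int(df_umi.loc[rank, 'UMI'] / 10)
        # Keep every barcode at or above one tenth of that count.
        return df_umi[df_umi.UMI >= umi_min]

    # 100 barcodes with UMI counts 1000, 990, ..., 10: the barcode at rank 30
    # has 700 UMIs, so the threshold is 70 and 94 barcodes survive.
    df = pd.DataFrame({'barcode': [f'bc{i}' for i in range(100)],
                       'UMI': range(1000, 0, -10)})
    print(len(umi_cutoff(df)))  # 94
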
+ """ def __init__(self, args, step_name): Step.__init__(self, args, step_name) -- Gitee From 1b29c57629bdeac5b9d96c6c828c5c1927397c9e Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 28 Jun 2021 11:26:32 +0800 Subject: [PATCH 95/96] add rerun option --- celescope/trust_vdj/assemble.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py index c4e0abdb..2c6e4531 100644 --- a/celescope/trust_vdj/assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -33,6 +33,7 @@ class Assemble(Step): self.sample = args.sample self.species = args.species self.speed_up = args.speed_up + self.rerun = args.rerun @utils.add_log @@ -62,6 +63,9 @@ class Assemble(Step): if not os.path.exists(f'{self.outdir}/{self.sample}_barcode_report.tsv'): os.system(cmd) + if self.rerun: + os.system(cmd) + #fq = f'{self.outdir}/TRUST4/{self.sample}_toassemble.fq' @@ -79,6 +83,7 @@ def get_opts_assemble(parser, sub_program): parser.add_argument('--fq2', help='R2 reads from match step', required=True) parser.add_argument('--species', help='species', choices=["Mmus", "Hsap"], required=True) + parser.add_argument('--rerun', help='Re-run the assemble step', action='store_true') parser.add_argument('--speed_up', help='speed assemble for TCR/BCR seq data', action='store_true') -- Gitee From 09bf64bc3aad3129fddef68625fb48743fd11b52 Mon Sep 17 00:00:00 2001 From: zhouxinseeu Date: Mon, 28 Jun 2021 11:28:52 +0800 Subject: [PATCH 96/96] rm unused import --- celescope/trust_vdj/assemble.py | 1 - celescope/trust_vdj/mapping.py | 1 - celescope/trust_vdj/matching.py | 1 - 3 files changed, 3 deletions(-) diff --git a/celescope/trust_vdj/assemble.py b/celescope/trust_vdj/assemble.py index 2c6e4531..d0c12d11 100644 --- a/celescope/trust_vdj/assemble.py +++ b/celescope/trust_vdj/assemble.py @@ -1,7 +1,6 @@ import os from celescope.tools import utils from celescope.tools.Step import Step, s_common -import pandas as pd TRUST = '/SGRNJ03/randd/zhouxin/software/TRUST4/' diff --git a/celescope/trust_vdj/mapping.py b/celescope/trust_vdj/mapping.py index b6a5b09b..52304187 100644 --- a/celescope/trust_vdj/mapping.py +++ b/celescope/trust_vdj/mapping.py @@ -1,5 +1,4 @@ import pandas as pd -import glob from celescope.tools.Step import Step, s_common from celescope.tools import utils import os diff --git a/celescope/trust_vdj/matching.py b/celescope/trust_vdj/matching.py index 2c5f1e33..0646a420 100644 --- a/celescope/trust_vdj/matching.py +++ b/celescope/trust_vdj/matching.py @@ -5,7 +5,6 @@ import pysam import pandas as pd from collections import defaultdict import glob -import re from Bio.Seq import Seq -- Gitee