diff --git a/.gitignore b/.gitignore index 37518ec5387a253b6876dc69b0ceed42d2ddeaee..be308f442650aa382abbf6c33159d04d05523b91 100755 --- a/.gitignore +++ b/.gitignore @@ -149,4 +149,6 @@ cython_debug/ #temp /temp/ +# .DS_store +.DS_store diff --git a/celescope/tools/auto_assign.R b/celescope/tools/auto_assign.R index efe4a810cf1f527fc63e35d4e7b49386f2d7a651..f33b8e37b9722cfb79bf873682c7bf722697b677 100755 --- a/celescope/tools/auto_assign.R +++ b/celescope/tools/auto_assign.R @@ -31,7 +31,7 @@ n_cell_name <- length(cell_name) #reset #all_data <- SetAllIdent(object = all_data, id = origin.cluster) -clusters <- sort(unique(all_data@ident)) +clusters <- sort(unique(all_data@active.ident)) #create dir auto_dir <- stringr::str_glue('{outdir}/{sample}_auto_assign/') @@ -47,9 +47,9 @@ for (cluster in clusters){ index = index + 1 pos = unlist(strsplit(marker_file[index,2,drop=T],",")) neg = tryCatch(unlist(strsplit(marker_file[index,3,drop=T],",")) ,error=function(e){} ) - for (feature in pos){ + for (F in pos){ tryCatch({ - dat <- FindMarkers(all_data,genes.use=feature,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) + dat <- FindMarkers(all_data,feature=F,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) dat$cell_type <- cell dat$cluster <- cluster dat <- rownames_to_column(dat,var="gene") @@ -61,13 +61,13 @@ for (cluster in clusters){ all_dat <- rbind(all_dat,dat) } } - ,error=function(e){print(paste0(feature," not found in cluster ",cluster)) }) + ,error=function(e){print(paste0(F," not found in cluster ",cluster)) }) } if (!is.na(neg) && !is.null(neg)){ - for (feature in neg){ + for (F in neg){ tryCatch({ - dat <- FindMarkers(all_data,genes.use=feature,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) + dat <- FindMarkers(all_data,feature=F,ident.1=cluster,min.pct = 0,logfc.threshold = -Inf) dat$cell_type <- cell dat$cluster <- cluster dat <- rownames_to_column(dat,var="gene") @@ -79,13 +79,14 @@ for (cluster in clusters){ all_dat <- rbind(all_dat,dat) } } - ,error=function(e){print(paste0(feature," not found in cluster ",cluster)) }) + ,error=function(e){print(paste0(F," not found in cluster ",cluster)) }) } } } } +all_dat$cluster <- as.numeric(all_dat$cluster) + 1 all_dat <- mutate(all_dat,pct.diff=pct.1-pct.2) exp.out = stringr::str_glue('{auto_dir}/{sample}_type_marker_exp.tsv') write_tsv(all_dat, exp.out) @@ -109,16 +110,16 @@ for (cluster in clusters){ dev.off() png(paste0(png_dir,cluster,"_logfc.png"),width=1200,height=1000) - p2 <- ggplot(c,aes(x=interaction(gene,cell_type,type),avg_logFC,fill=cell_type)) +geom_bar(stat="identity")+ coord_flip() + scale_fill_manual(values=color2) + p2 <- ggplot(c,aes(x=interaction(gene,cell_type,type),avg_log2FC,fill=cell_type)) +geom_bar(stat="identity")+ coord_flip() + scale_fill_manual(values=color2) print (p2) dev.off() } # auto assign -exp[exp$type=="negative",]$avg_logFC = -(exp[exp$type=="negative",]$avg_logFC) +exp[exp$type=="negative",]$avg_log2FC = -(exp[exp$type=="negative",]$avg_log2FC) exp[exp$type=="negative",]$pct.diff = -(exp[exp$type=="negative",]$pct.diff) a <- group_by(exp,cluster,cell_type) -as <- summarize(a,avg_pct.diff=mean(pct.diff),avg_logfc=mean(avg_logFC),max_p_val_adj=max(p_val_adj)) +as <- summarize(a,avg_pct.diff=mean(pct.diff),avg_logfc=mean(avg_log2FC),max_p_val_adj=max(p_val_adj)) as1 <- group_by(ungroup(as),cluster) as1 <- mutate(as1,pct_rank = rank(avg_pct.diff), logfc_rank= rank(avg_logfc),total_rank=pct_rank+logfc_rank) diff --git a/celescope/tools/run_analysis.R b/celescope/tools/run_analysis.R index 5e0b55922a4d6fb33e1a50642118a9231c072784..5883c06f6f2f4586cd06efbde017229224064771 100755 --- a/celescope/tools/run_analysis.R +++ b/celescope/tools/run_analysis.R @@ -1,9 +1,12 @@ -library(Seurat) +library(Seurat) # v4.0 library(tidyverse) library(argparser) +library(hdf5r) +library(rhdf5) + argv <- arg_parser('') -argv <- add_argument(argv,"--matrix_file", help="matrix file") +argv <- add_argument(argv,"--matrix_dir", help="cell 10X matrix dir") argv <- add_argument(argv,"--outdir", help="outdir") argv <- add_argument(argv,"--sample", help="sample") argv <- add_argument(argv,"--save_rds", help="write rds to disk") @@ -17,18 +20,33 @@ save_rds = argv$save_rds resolution = 0.6 res_str = paste0('res.', resolution) -matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") +matrix = Seurat::Read10X(matrix_dir, gene.column=2) tsne.out = stringr::str_glue('{outdir}/{sample}_tsne_coord.tsv') marker.out = stringr::str_glue('{outdir}/{sample}_markers.tsv') mito.out = paste(outdir,"stat.txt",sep="/") rds.out = paste0(outdir,'/',sample,'.rds') +# read 10X +matrix = read.table(matrix_file,sep="\t",header=TRUE,row.names=1,quote = "") +rds = CreateSeuratObject(matrix, pro=sample) -rds = CreateSeuratObject(raw.data = matrix,project=sample) +# generate h5ad file +x = GetAssayData(rds,slot="count") +mtx = as.matrix(x) +barcode = colnames(rds) +geneid = rownames(rds) +h5.out = stringr::str_glue('{outdir}/{sample}.h5') +path <- path.expand(h5.out) +h5createFile(path) +h5f <- H5Fopen(path) +h5writeDataset(mtx,h5f,"X") +h5writeDataset(barcode,h5f,"obs") +h5writeDataset(geneid,h5f,"var") +H5Fclose(h5f) # mito -mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@data), value = TRUE, ignore.case=TRUE) -percent.mito <- Matrix::colSums(rds@raw.data[mito.genes,])/Matrix::colSums(rds@raw.data) +mito.genes <- grep(pattern = "^MT-", x = rownames(x = rds@assays$RNA@data), value = TRUE, ignore.case=TRUE) +percent.mito <- Matrix::colSums(rds@assays$RNA@counts[mito.genes,])/Matrix::colSums(rds@assays$RNA@counts) rds <- AddMetaData(object = rds, metadata = percent.mito, col.name = "percent.mito") meta = rds@meta.data total_cell = dim(meta)[1] @@ -43,44 +61,53 @@ mito_df$cell_percent = paste0(round(mito_df$cell_percent * 100,2),"%") mito_df$mito_percent = paste0("Fraction of cells have mito gene percent>",round(mito_df$mito_percent * 100,2),"%") write_delim(mito_df, mito.out, col_names=F, delim=":") -rds <- NormalizeData(object = rds, normalization.method = "LogNormalize",scale.factor = 10000) -rds <- FindVariableGenes(object = rds, mean.function = ExpMean, dispersion.function = LogVMR, x.low.cutoff = 0.1, y.cutoff = 1, do.contour=F) -use.gene = rds@var.genes -rds <- ScaleData(object = rds,vars.to.regress = c("nUMI", "percent.mito"),genes.use =use.gene) -rds <- RunPCA(object = rds, pc.genes = use.gene, do.print = FALSE) -rds <- FindClusters(object = rds, reduction.type = "pca", dims.use = 1:20, resolution = resolution, print.output = 0, save.SNN = TRUE,force.recalc = TRUE) -rds@meta.data[[res_str]] = as.numeric(rds@meta.data[[res_str]]) + 1 -rds = SetAllIdent(rds, res_str) - -# Run Non-linear dimensional reduction (tSNE) -rds <- RunTSNE(object = rds, dims.use = 1:20, do.fast = TRUE,check_duplicates = FALSE) + +rds <- NormalizeData(rds, normalization.method = "LogNormalize",scale.factor = 10000) +rds <- FindVariableFeatures(rds, selection.method = "vst", nfeatures = 2000, mean.cutoff = c(0.1, 8), dispersion.cutoff = c(1, Inf), + mean.function = ExpMean, dispersion.function = LogVMR) + +use.genes <- rds@assays$RNA@var.features +rds <- ScaleData(rds, vars.to.regress = c("nCount_RNA", "percent.mito"), features = use.genes) +rds <- RunPCA(object = rds, features = use.genes, do.print = FALSE) +rds <- FindNeighbors(rds, dims = 1:20, force.recalc = TRUE, reduction = "pca") +rds <- FindClusters(rds, resolution = resolution) + +# tsne and umap +rds <- RunTSNE(rds, dims = 1:20, do.fast = TRUE, check_duplicates = FALSE) + + tryCatch({ - rds.markers <- FindAllMarkers(object = rds, genes.use = use.gene) - rds.markers = dplyr::group_by(rds.markers,cluster) %>% dplyr::arrange(desc(avg_logFC)) + rds.markers <- FindAllMarkers(object = rds, features = use.genes) + rds.markers = dplyr::group_by(rds.markers,cluster) %>% dplyr::arrange(desc(avg_log2FC)) }, error = function(e){ print (paste0("no marker found: ", e)) rds.markers <<- data.frame(cluster=double(), - gene=double(), - avg_logFC=double(), - pct.1=double(), - pct.2=double(), - p_val_adj=double()) + gene=double(), + avg_log2FC=double(), + pct.1=double(), + pct.2=double(), + p_val_adj=double()) }) + +rds.markers$cluster = as.numeric(rds.markers$cluster) print (rds.markers) write_tsv(rds.markers,marker.out,col_names = T) -df.tsne = rds@dr$tsne@cell.embeddings + +df.tsne = rds@reductions$tsne@cell.embeddings df.tsne = as.data.frame(df.tsne) meta = rds@meta.data -dic = rds@meta.data[[res_str]] +dic = rds@meta.data[['seurat_clusters']] names(dic) = rownames(rds@meta.data) df.tsne$cluster = as.numeric(dic[rownames(df.tsne)]) -df.gene = meta[,"nGene",drop=F] +rds@meta.data$seurat_clusters = as.numeric(dic[rownames(df.tsne)]) +df.gene = meta[,"nFeature_RNA",drop=F] colnames(df.gene) = "Gene_Counts" df.all = cbind(df.tsne,df.gene) write.table(df.all,tsne.out,sep="\t",col.names=NA,quote = F) + if (save_rds == 'True'){ saveRDS(rds, rds.out) } \ No newline at end of file