-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy paths2_hvg.py
94 lines (79 loc) · 2.75 KB
/
s2_hvg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import argparse
# deal with input
parser = argparse.ArgumentParser(description='Select hvg, perform dimensionality reduction and clustering')
parser.add_argument('-n', '--number', help='the number of hvgs selected', default=2000, type=int)
parser.add_argument('-p', '--prefix', help='prefix of the input file', required=True, type=str)
parser.add_argument('-b', '--batch', help='batch key of the file', default='patient', type=str, required=False)
parser.add_argument('--plot_major_lineage', required=False, default=True, type=bool)
args = parser.parse_args()
import os
import re
import sys
import traceback
import warnings
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
from util import *
sc.settings.verbosity = 3
warnings.filterwarnings("ignore")
sc.settings.set_figure_params(
dpi=80,
facecolor='white',
frameon=False
)
n = args.number
prefix = args.prefix
batchkey = args.batch
input_file = prefix + '.h5ad'
identifier = ''.join(prefix.split('_')[1:-1])
hvg_file = Path(f"d3_{identifier}_hvg.h5ad")
umap_file = Path(f"d4_{identifier}_umap.h5ad")
print(f'''
-------------------------
Input file:
{input_file}
Do:
remove doublets
select {n} hvgs
Output file:
{hvg_file}
{umap_file}
-------------------------
''')
# read data
try:
adata = sc.read_h5ad(input_file)
print(adata)
if batchkey not in adata.obs.columns:
print(f'batch key {batchkey} not found in obs, please check')
sys.exit(1)
# choose hvg
adata.uns['log1p']['base'] = None
adata.obs[batchkey] = adata.obs[batchkey].astype('category')
sc.pp.highly_variable_genes(adata, n_top_genes=n, flavor='seurat', n_bins=20, inplace=True, batch_key=batchkey) # seurat_v3 is for count data, seurat is for log data
adata.raw = adata
adata = adata[:, adata.var['highly_variable']]
sc.pp.pca(adata, n_comps=50, use_highly_variable=True, svd_solver='arpack')
adata.write_h5ad(hvg_file)
print(f'Highly variable genes (n={n}) are saved to {hvg_file}')
sc.pp.neighbors(adata)
sc.tl.umap(adata)
sc.tl.leiden(adata)
sc.pl.umap(adata, color=['leiden', batchkey], legend_loc='on data', legend_fontsize='small', show=False, save='_batch_before_bbknn.pdf')
sc.pl.umap(adata, color=['doublet', 'doublet_score'], cmap='Reds', show=False)
save_fig_to_path('doublet_umap.pdf', 'figures')
sc.pl.violin(adata, "doublet_score")
save_fig_to_path('doublet_violin_plot.pdf', 'figures')
adata.write_h5ad(umap_file) # type: ignore
if args.plot_major_lineage:
plot_major_lineage(adata)
print('Finish')
except Exception as e:
print('Error:\n', e)
traceback.print_exc(file=sys.stdout)
print('Current adata object:\n', adata)
sys.exit(1)