1 Star 0 Fork 0

Kahsolt/PDB-analyze

加入 Gitee
与超过 1200万 开发者一起发现、参与优秀开源项目,私有仓库也完全免费 :)
免费加入
文件
克隆/下载
cls_seqs.py 2.17 KB
一键复制 编辑 原始数据 按行查看 历史
Kahsolt 提交于 2023-01-11 18:16 . merge repo
#!/usr/bin/env python3
# Author: Armit
# Create Time: 2023/01/09
# Clustering for chain sequence
#
# NOTE: 这个仅作参考,不一定正确
# - 非自然地修改某些关键 residue 可能会导致 model 空间结构大变
from pathlib import Path
import pickle as pkl
from argparse import ArgumentParser
from typing import List
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearnex import patch_sklearn ; patch_sklearn()
from Levenshtein import distance
from data import DATA_PATH
def get_data() -> List[str]:
    """Load the chain sequences stored in ``sequence_uniq.txt`` under ``DATA_PATH``.

    The file is read as text, surrounding whitespace is stripped and the
    content is split on newlines, giving one list entry per line.

    Returns:
        The lines of the file, in file order.
    """
    seq_file = Path(DATA_PATH) / 'sequence_uniq.txt'
    content = seq_file.read_text()
    return content.strip().split('\n')
def get_hash(s:str) -> int:
    """Return a cheap rolling hash of ``s`` in the range ``[0, 1000000009)``.

    Each character is folded in as ``(h << 7) + (h >> 5) + ord(c)``, reduced
    modulo 1000000009 after every step so the accumulator stays bounded.

    Args:
        s: The string to hash.

    Returns:
        The hash value; ``0`` for the empty string.
    """
    h = 0
    for c in s:
        # The parentheses are required: ``+`` binds tighter than ``<<``/``>>``,
        # so the unparenthesised form evaluates ``(h << (7 + h)) >> (5 + ord(c))``,
        # which stays 0 forever when starting from h == 0.
        h = ((h << 7) + (h >> 5) + ord(c)) % 1000000009
    return h
def get_distmat(seqs:List[str]) -> np.ndarray:
    """Return the pairwise edit-distance matrix of ``seqs``, cached on disk.

    The matrix is stored together with a hash of the concatenated sequences
    in ``sequence_uniq.distmat.pkl`` under ``DATA_PATH``. A cached matrix is
    reused only when the stored hash equals the current one and its shape is
    ``(len(seqs), len(seqs))``; otherwise it is recomputed and the cache file
    is overwritten.

    Args:
        seqs: The sequences to compare.

    Returns:
        A symmetric ``(n, n)`` array with a zero diagonal, where entry
        ``[i, j]`` is ``distance(seqs[i], seqs[j], weights=(1, 1, 1))``.
    """
    fp = Path(DATA_PATH) / 'sequence_uniq.distmat.pkl'
    h = get_hash(''.join(seqs))
    n_seqs = len(seqs)

    if fp.exists():
        print('>> try load cached dist matrix')
        # NOTE(security): unpickling can execute arbitrary code; only load a
        # cache file that was written by this script.
        with open(fp, 'rb') as fh:
            dists, h_saved = pkl.load(fh)
        # The shape check also rejects a cache built for a different number
        # of sequences.
        if h_saved == h and dists.shape == (n_seqs, n_seqs):
            return dists
        print('<< cache deprecated due to hash mismatch')

    print('>> precomputed dist matrix')
    dists = np.zeros(shape=(n_seqs, n_seqs))
    # Start at 0 so that the first sequence is compared as well; starting at 1
    # leaves row 0 and column 0 all-zero.
    # NOTE(review): a cache file written while the loop started at 1 still has
    # that zero row/column -- delete it once to force a rebuild.
    for i in range(n_seqs - 1):
        for j in range(i + 1, n_seqs):
            dists[i, j] = dists[j, i] = distance(seqs[i], seqs[j], weights=(1, 1, 1))
    with open(fp, 'wb') as fh:
        pkl.dump((dists, h), fh)
    return dists
def cluster(args):
    """Cluster the chain sequences on their precomputed distance matrix.

    Loads the sequences, obtains the pairwise distance matrix, shows a
    histogram of all distances, fits an ``AgglomerativeClustering`` model on
    the matrix, shows a histogram of the predicted cluster labels, and finally
    drops into the debugger so the results can be inspected interactively.

    Args:
        args: Parsed command-line arguments; ``args.n`` is the number of
            clusters and ``args.linkage`` the linkage criterion.
    """
    seqs = get_data()
    print(f'>> loaded {len(seqs)} seqs')
    dists = get_distmat(seqs)

    # Distribution of all pairwise distances.
    plt.hist(dists.flatten(), bins=100)
    plt.tight_layout()
    plt.show()

    # scikit-learn 1.2 renamed ``affinity`` to ``metric`` and 1.4 removed the
    # old name. An unknown keyword raises TypeError at construction, so try the
    # new name first and fall back for older releases.
    try:
        model = AgglomerativeClustering(n_clusters=args.n, metric='precomputed', linkage=args.linkage)
    except TypeError:
        model = AgglomerativeClustering(n_clusters=args.n, affinity='precomputed', linkage=args.linkage)
    pred = model.fit_predict(dists)

    # Distribution of the predicted cluster labels.
    plt.hist(pred.flatten(), bins=100)
    plt.tight_layout()
    plt.show()

    # Deliberate stop: nothing is returned or saved, so `seqs`, `dists`,
    # `model` and `pred` are only reachable from the debugger here.
    breakpoint()
if __name__ == '__main__':
    # Command-line entry point: read the clustering options and run.
    cli = ArgumentParser()
    cli.add_argument('-n', type=int, default=12117, help='n_clusters')
    cli.add_argument(
        '-L', '--linkage',
        choices=['complete', 'average', 'single'],
        default='average',
    )
    cluster(cli.parse_args())
Loading...
马建仓 AI 助手
尝试更多
代码解读
代码找茬
代码优化
1
https://gitee.com/kahsolt/pdb-analyze.git
git@gitee.com:kahsolt/pdb-analyze.git
kahsolt
pdb-analyze
PDB-analyze
master

搜索帮助