代码拉取完成,页面将自动刷新
#!/usr/bin/env python3
# Author: Armit
# Create Time: 2023/01/09
# Clustering for chain sequence
#
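# NOTE: this is for reference only and is not necessarily correct
# - artificially mutating certain key residues may drastically change the model's spatial structure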
from pathlib import Path
import pickle as pkl
from argparse import ArgumentParser
from typing import List
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearnex import patch_sklearn ; patch_sklearn()
from Levenshtein import distance
from data import DATA_PATH
def get_data() -> List[str]:
    """Read the unique-sequence list from ``DATA_PATH / 'sequence_uniq.txt'``.

    Returns:
        The file content with surrounding whitespace stripped, split on
        newlines (one entry per line).
    """
    seq_file = Path(DATA_PATH) / 'sequence_uniq.txt'
    content = seq_file.read_text()
    # Strip first so a trailing newline does not produce an empty last entry.
    return content.strip().split('\n')
def get_hash(s:str) -> int:
    """Compute a small rolling integer hash of ``s``.

    The result serves as a cheap fingerprint of the input text.

    Args:
        s: text to hash.

    Returns:
        An int in ``[0, 1000000009)``; ``0`` for the empty string.
    """
    h = 0
    for c in s:
        # Parentheses are required: `+` binds tighter than `<<` / `>>`, so the
        # unparenthesised form evaluates `h << (7 + h) >> (5 + ord(c))`, which
        # stays 0 forever when starting from h = 0.
        h = (h << 7) + (h >> 5) + ord(c)
        # Reduce every step so h stays bounded for long inputs.
        h %= 1000000009
    return h
def get_distmat(seqs:List[str]) -> np.ndarray:
    """Return the symmetric pairwise distance matrix of ``seqs``, cached on disk.

    The matrix is stored together with a hash of the concatenated sequences in
    ``DATA_PATH / 'sequence_uniq.distmat.pkl'``. A cached matrix is reused only
    when its stored hash equals the current one and its shape matches the
    number of sequences; otherwise the matrix is recomputed and the cache file
    is overwritten.

    Args:
        seqs: sequences to compare pairwise.

    Returns:
        An ``(n, n)`` float array with ``dists[i, j] == dists[j, i]`` and a
        zero diagonal, where ``n == len(seqs)``.
    """
    fp = Path(DATA_PATH) / 'sequence_uniq.distmat.pkl'
    h = get_hash(''.join(seqs))
    n_seqs = len(seqs)

    if fp.exists():
        print('>> try load cached dist matrix')
        # NOTE: unpickling can execute arbitrary code; this is only safe for a
        # cache file written by this script itself.
        with open(fp, 'rb') as fh:
            dists, h_saved = pkl.load(fh)
        if h_saved != h:
            print('<< cache deprecated due to hash mismatch')
        elif dists.shape != (n_seqs, n_seqs):
            # Guards against a stale cache whose hash happens to collide.
            print('<< cache deprecated due to shape mismatch')
        else:
            return dists

    print('>> precomputed dist matrix')
    dists = np.zeros(shape=(n_seqs, n_seqs))
    # Iterate over every unordered pair (i < j). The outer loop must start at
    # 0, otherwise the first sequence's row/column is never filled in.
    for i in range(n_seqs - 1):
        for j in range(i + 1, n_seqs):
            dists[i, j] = dists[j, i] = distance(seqs[i], seqs[j], weights=(1, 1, 1))

    with open(fp, 'wb') as fh:
        pkl.dump((dists, h), fh)
    return dists
def cluster(args):
    """Run agglomerative clustering on the sequences' pairwise distances.

    Steps: load the sequences, obtain their distance matrix, show a histogram
    of all distances, fit ``AgglomerativeClustering`` on the precomputed
    matrix, show a histogram of the predicted cluster labels, then drop into
    the debugger so the results can be inspected interactively.

    Args:
        args: parsed command-line namespace; ``args.n`` is the number of
            clusters and ``args.linkage`` the linkage criterion.
    """
    seqs = get_data()
    print(f'>> loaded {len(seqs)} seqs')
    dists = get_distmat(seqs)

    # Distribution of all pairwise distances.
    plt.hist(dists.flatten(), bins=100)
    plt.tight_layout()
    plt.show()

    # scikit-learn 1.2 renamed `affinity` to `metric` and 1.4 removed the old
    # name; try the current keyword first and fall back for older releases.
    try:
        model = AgglomerativeClustering(n_clusters=args.n, metric='precomputed', linkage=args.linkage)
    except TypeError:
        model = AgglomerativeClustering(n_clusters=args.n, affinity='precomputed', linkage=args.linkage)
    pred = model.fit_predict(dists)

    # Distribution of the assigned cluster labels.
    plt.hist(pred.flatten(), bins=100)
    plt.tight_layout()
    plt.show()

    # Intentional: `pred` is not returned or saved, so it is inspected here.
    breakpoint()
if __name__ == '__main__':
    # Command-line entry point: parse the clustering options and run.
    cli = ArgumentParser()
    cli.add_argument('-n', type=int, default=12117, help='n_clusters')
    cli.add_argument(
        '-L', '--linkage',
        choices=['complete', 'average', 'single'],
        default='average',
    )
    cluster(cli.parse_args())
此处可能存在不合适展示的内容,页面不予展示。您可通过相关编辑功能自查并修改。
如您确认内容无涉及 不当用语 / 纯广告导流 / 暴力 / 低俗色情 / 侵权 / 盗版 / 虚假 / 无价值内容或违法国家有关法律法规的内容,可点击提交进行申诉,我们将尽快为您处理。