当前位置：首页 > news >正文

TemplateHit中提取query和hit比对上序列索引的映射字典

news 来源：原创 2024/5/14 11:59:44

template_hits(Sequence[TemplateHit]数据格式)来自结构数据库搜索结果 python运行hhsearch二进制命令的包装器类映射索引计算：TemplateHit 中含有 indices_query，需要换算成在原始query序列中的index，hit 中indices_hit 需要减去最小index（-1 gap 除外）

import pickle 
import dataclasses
from typing import Optional, List, Sequence, Mapping@dataclasses.dataclass(frozen=True)
class TemplateHit:"""Class representing a template hit."""index: intname: straligned_cols: intsum_probs: Optional[float]query: strhit_sequence: strindices_query: List[int]indices_hit: List[int]### 读入Sequence[TemplateHit]数据
with open('test_pdb_hits.pkl', 'rb') as file:# 使用 pickle.load 从文件中加载对象test_pdb_hits = pickle.load(file)#test_pdb_hits.pkl由python运行hhsearch二进制命令的包装器类 的结果 template_hits 保存得到
#import pickle
#with open('test_pdb_hits.pkl', 'wb') as file:
#  pickle.dump(template_hits, file)def build_query_to_hit_index_mapping(hit_query_sequence: str,hit_sequence: str,indices_hit: Sequence[int],indices_query: Sequence[int],original_query_sequence: str) -> Mapping[int, int]:"""Gets mapping from indices in original query sequence to indices in the hit.hit_query_sequence and hit_sequence are two aligned sequences containing gapcharacters. hit_query_sequence contains only the part of the original querysequence that matched the hit. When interpreting the indices from the .hhr, weneed to correct for this to recover a mapping from original query sequence tothe hit sequence.Args:hit_query_sequence: The portion of the query sequence that is in the .hhrhithit_sequence: The portion of the hit sequence that is in the .hhrindices_hit: The indices for each aminoacid relative to the hit sequenceindices_query: The indices for each aminoacid relative to the original querysequenceoriginal_query_sequence: String describing the original query sequence.Returns:Dictionary with indices in the original query sequence as keys and indicesin the hit sequence as values."""# If the hit is empty (no aligned residues), return empty mappingif not hit_query_sequence:return {}# Remove gaps and find the offset of hit.query relative to original query.hhsearch_query_sequence = hit_query_sequence.replace('-', '')hit_sequence = hit_sequence.replace('-', '')hhsearch_query_offset = original_query_sequence.find(hhsearch_query_sequence)print(f"hhsearch_query_offset:{hhsearch_query_offset}")# Index of -1 used for gap characters. Subtract the min index ignoring gaps.min_idx = min(x for x in indices_hit if x > -1)fixed_indices_hit = [x - min_idx if x > -1 else -1 for x in indices_hit]print(f"fixed_indices_hit:{fixed_indices_hit}")min_idx = min(x for x in indices_query if x > -1)fixed_indices_query = [x - min_idx if x > -1 else -1 for x in indices_query]print(f"fixed_indices_query:{fixed_indices_query}")# Zip the corrected indices, ignore case where both seqs have gap characters.mapping = {}for q_i, q_t in zip(fixed_indices_query, fixed_indices_hit):if q_t != -1 and q_i != -1:if (q_t >= len(hit_sequence) orq_i + hhsearch_query_offset >= len(original_query_sequence)):continuemapping[q_i + hhsearch_query_offset] = q_treturn mappinghit = test_pdb_hits[0]
input_fasta_file = 'Q94K49.fasta'
## 从fasta文件提取 query_sequence（str格式）
query_sequence = ""
with open(input_fasta_file) as f:for line in f.readlines():if line.startswith(">"):continuequery_sequence += line.strip()print(f"hit.query:{hit.query}")
print(f"hit.hit_sequence:{hit.hit_sequence}")
print(f"hit.indices_hit:{hit.indices_hit}")
print(f"hit.indices_query:{hit.indices_query}")
print(f"query_sequence:{query_sequence}")##query和hit序列比对上的氨基酸在各自多肽链上索引的对应字典
mapping = build_query_to_hit_index_mapping(hit.query, hit.hit_sequence, hit.indices_hit, hit.indices_query,query_sequence)
print(mapping)