"HHR" 文件格式通常与HMMER(Hidden Markov Model for Molecular Evolution)软件相关联,该软件用于执行隐马尔可夫模型的序列比对和搜索。HMMER用于生物信息学中的蛋白质和核酸序列比对任务,例如蛋白质家族分类、序列相似性搜索等。HHR 文件是HMMER比对结果的一种输出格式,它包含了与序列比对相关的信息。
- import dataclasses
- import re
- from typing import List, Optional, Sequence
-
- @dataclasses.dataclass(frozen=True)
- class TemplateHit:
- """Class representing a template hit."""
- index: int
- name: str
- aligned_cols: int
- sum_probs: Optional[float]
- query: str
- hit_sequence: str
- indices_query: List[int]
- indices_hit: List[int]
-
-
- def _get_hhr_line_regex_groups(
- regex_pattern: str, line: str) -> Sequence[Optional[str]]:
- match = re.match(regex_pattern, line)
- if match is None:
- raise RuntimeError(f'Could not parse query line {line}')
- return match.groups()
-
-
-
- def _update_hhr_residue_indices_list(
- sequence: str, start_index: int, indices_list: List[int]):
- """Computes the relative indices for each residue with respect to the original sequence."""
- counter = start_index
- for symbol in sequence:
- if symbol == '-':
- indices_list.append(-1)
- else:
- indices_list.append(counter)
- counter += 1
-
-
- def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
- """Parses the detailed HMM HMM comparison section for a single Hit.
- This works on .hhr files generated from both HHBlits and HHSearch.
- Args:
- detailed_lines: A list of lines from a single comparison section between 2
- sequences (which each have their own HMM's)
- Returns:
- A dictionary with the information from that detailed comparison section
- Raises:
- RuntimeError: If a certain line cannot be processed
- """
- # Parse first 2 lines.
- number_of_hit = int(detailed_lines[0].split()[-1])
- name_hit = detailed_lines[1][1:]
-
- # Parse the summary line.
- #pattern = (
- # 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
- # ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t '
- # ']*Template_Neff=(.*)')
-
- # "()":标记一个子表达式的开始和结束位置.
- # '.':匹配除换行符 \n 之外的任何单字符
- # '*': 匹配前面的子表达式零次或多次。
- # '[]':匹配其中列举的字符,可以是很多单个,也可以范围
- # '\t': 换行符
-
- pattern = (
- 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
- ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)')
-
- # Probab=100.00 E-value=1.5e-142 Score=1017.19 Aligned_cols=366 Identities=56% Similarity=1.021 Sum_probs=364.0
- match = re.match(pattern, detailed_lines[2])
- # 示例detailed_lines[2]:Probab=91.65 E-value=0.12 Score=40.00 Aligned_cols=62 Identities=16% Similarity=0.149 Sum_probs=42.0
- # print(detailed_lines[2])
- # print(match)
-
- if match is None:
- raise RuntimeError(
- 'Could not parse section: %s. Expected this: \n%s to contain summary.' %
- (detailed_lines, detailed_lines[2]))
- (_, _, _, aligned_cols, _, _, sum_probs) = [float(x)
- for x in match.groups()]
-
- # The next section reads the detailed comparisons. These are in a 'human
- # readable' format which has a fixed length. The strategy employed is to
- # assume that each block starts with the query sequence line, and to parse
- # that with a regexp in order to deduce the fixed length used for that block.
- query = ''
- hit_sequence = ''
- indices_query = []
- indices_hit = []
- length_block = None
-
- for line in detailed_lines[3:]:
- # Parse the query sequence line
- if (line.startswith('Q ') and not line.startswith('Q ss_dssp') and
- not line.startswith('Q ss_pred') and
- not line.startswith('Q Consensus')):
- # Thus the first 17 characters must be 'Q
' , and we can parse - # everything after that.
- # start sequence end total_sequence_length
- patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)'
- # pattern示例:306 TATYDFKMADLQQVAPEATVRRFLQGRRNLAKVCALLRGYLLPGAPADL 355 (431)
- groups = _get_hhr_line_regex_groups(patt, line[17:])
-
- #print("groups")
- #print(groups)
-
- # Get the length of the parsed block using the start and finish indices,
- # and ensure it is the same as the actual block length.
- start = int(groups[0]) - 1 # Make index zero based.
- delta_query = groups[1]
- end = int(groups[2])
- num_insertions = len([x for x in delta_query if x == '-'])
- length_block = end - start + num_insertions
- assert length_block == len(delta_query)
-
- # Update the query sequence and indices list.
- query += delta_query
- _update_hhr_residue_indices_list(delta_query, start, indices_query)
-
- elif line.startswith('T '):
- # Parse the hit sequence.
- if (not line.startswith('T ss_dssp') and
- not line.startswith('T ss_pred') and
- not line.startswith('T Consensus')):
- # Thus the first 17 characters must be 'T
' , and we can - # parse everything after that.
- # start sequence end total_sequence_length
- patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)'
- groups = _get_hhr_line_regex_groups(patt, line[17:])
- start = int(groups[0]) - 1 # Make index zero based.
- delta_hit_sequence = groups[1]
- assert length_block == len(delta_hit_sequence)
-
- # Update the hit sequence and indices list.
- hit_sequence += delta_hit_sequence
- _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)
-
- return TemplateHit(
- index=number_of_hit,
- name=name_hit,
- aligned_cols=int(aligned_cols),
- sum_probs=sum_probs,
- query=query,
- hit_sequence=hit_sequence,
- indices_query=indices_query,
- indices_hit=indices_hit,
- )
-
-
- def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
- """Parses the content of an entire HHR file."""
- lines = hhr_string.splitlines()
-
- # Each .hhr file starts with a results table, then has a sequence of hit
- # "paragraphs", each paragraph starting with a line 'No
' . We - # iterate through each paragraph to parse each hit.
-
- block_starts = [i for i, line in enumerate(lines) if line.startswith('No ')]
-
- hits = []
- if block_starts:
- block_starts.append(len(lines)) # Add the end of the final block.
- for i in range(len(block_starts) - 1):
- hits.append(_parse_hhr_hit(lines[block_starts[i]:block_starts[i + 1]]))
- return hits
-
-
- ## query.hhr文件路径 path/to/hh-suite/data/query.hhr
- with open("query.hhr") as f:
- hhr_str = f.read()
-
- hits = parse_hhr(hhr_str)
- print(len(hits))
- print(hits)
参考: