• python解析.hhr文件


    "HHR" 文件格式通常与HMMER(Hidden Markov Model for Molecular Evolution)软件相关联,该软件用于执行隐马尔可夫模型的序列比对和搜索。HMMER用于生物信息学中的蛋白质和核酸序列比对任务,例如蛋白质家族分类、序列相似性搜索等。HHR 文件是HMMER比对结果的一种输出格式,它包含了与序列比对相关的信息。

     hhr文件格式介绍

    1. import dataclasses
    2. import re
    3. from typing import List, Optional, Sequence
    4. @dataclasses.dataclass(frozen=True)
    5. class TemplateHit:
    6. """Class representing a template hit."""
    7. index: int
    8. name: str
    9. aligned_cols: int
    10. sum_probs: Optional[float]
    11. query: str
    12. hit_sequence: str
    13. indices_query: List[int]
    14. indices_hit: List[int]
    15. def _get_hhr_line_regex_groups(
    16. regex_pattern: str, line: str) -> Sequence[Optional[str]]:
    17. match = re.match(regex_pattern, line)
    18. if match is None:
    19. raise RuntimeError(f'Could not parse query line {line}')
    20. return match.groups()
    21. def _update_hhr_residue_indices_list(
    22. sequence: str, start_index: int, indices_list: List[int]):
    23. """Computes the relative indices for each residue with respect to the original sequence."""
    24. counter = start_index
    25. for symbol in sequence:
    26. if symbol == '-':
    27. indices_list.append(-1)
    28. else:
    29. indices_list.append(counter)
    30. counter += 1
    31. def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
    32. """Parses the detailed HMM HMM comparison section for a single Hit.
    33. This works on .hhr files generated from both HHBlits and HHSearch.
    34. Args:
    35. detailed_lines: A list of lines from a single comparison section between 2
    36. sequences (which each have their own HMM's)
    37. Returns:
    38. A dictionary with the information from that detailed comparison section
    39. Raises:
    40. RuntimeError: If a certain line cannot be processed
    41. """
    42. # Parse first 2 lines.
    43. number_of_hit = int(detailed_lines[0].split()[-1])
    44. name_hit = detailed_lines[1][1:]
    45. # Parse the summary line.
    46. #pattern = (
    47. # 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
    48. # ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t '
    49. # ']*Template_Neff=(.*)')
    50. # "()":标记一个子表达式的开始和结束位置.
    51. # '.':匹配除换行符 \n 之外的任何单字符
    52. # '*': 匹配前面的子表达式零次或多次。
    53. # '[]':匹配其中列举的字符,可以是很多单个,也可以范围
    54. # '\t': 换行符
    55. pattern = (
    56. 'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
    57. ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)')
    58. # Probab=100.00 E-value=1.5e-142 Score=1017.19 Aligned_cols=366 Identities=56% Similarity=1.021 Sum_probs=364.0
    59. match = re.match(pattern, detailed_lines[2])
    60. # 示例detailed_lines[2]:Probab=91.65 E-value=0.12 Score=40.00 Aligned_cols=62 Identities=16% Similarity=0.149 Sum_probs=42.0
    61. # print(detailed_lines[2])
    62. # print(match)
    63. if match is None:
    64. raise RuntimeError(
    65. 'Could not parse section: %s. Expected this: \n%s to contain summary.' %
    66. (detailed_lines, detailed_lines[2]))
    67. (_, _, _, aligned_cols, _, _, sum_probs) = [float(x)
    68. for x in match.groups()]
    69. # The next section reads the detailed comparisons. These are in a 'human
    70. # readable' format which has a fixed length. The strategy employed is to
    71. # assume that each block starts with the query sequence line, and to parse
    72. # that with a regexp in order to deduce the fixed length used for that block.
    73. query = ''
    74. hit_sequence = ''
    75. indices_query = []
    76. indices_hit = []
    77. length_block = None
    78. for line in detailed_lines[3:]:
    79. # Parse the query sequence line
    80. if (line.startswith('Q ') and not line.startswith('Q ss_dssp') and
    81. not line.startswith('Q ss_pred') and
    82. not line.startswith('Q Consensus')):
    83. # Thus the first 17 characters must be 'Q ', and we can parse
    84. # everything after that.
    85. # start sequence end total_sequence_length
    86. patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)'
    87. # pattern示例:306 TATYDFKMADLQQVAPEATVRRFLQGRRNLAKVCALLRGYLLPGAPADL 355 (431)
    88. groups = _get_hhr_line_regex_groups(patt, line[17:])
    89. #print("groups")
    90. #print(groups)
    91. # Get the length of the parsed block using the start and finish indices,
    92. # and ensure it is the same as the actual block length.
    93. start = int(groups[0]) - 1 # Make index zero based.
    94. delta_query = groups[1]
    95. end = int(groups[2])
    96. num_insertions = len([x for x in delta_query if x == '-'])
    97. length_block = end - start + num_insertions
    98. assert length_block == len(delta_query)
    99. # Update the query sequence and indices list.
    100. query += delta_query
    101. _update_hhr_residue_indices_list(delta_query, start, indices_query)
    102. elif line.startswith('T '):
    103. # Parse the hit sequence.
    104. if (not line.startswith('T ss_dssp') and
    105. not line.startswith('T ss_pred') and
    106. not line.startswith('T Consensus')):
    107. # Thus the first 17 characters must be 'T ', and we can
    108. # parse everything after that.
    109. # start sequence end total_sequence_length
    110. patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)'
    111. groups = _get_hhr_line_regex_groups(patt, line[17:])
    112. start = int(groups[0]) - 1 # Make index zero based.
    113. delta_hit_sequence = groups[1]
    114. assert length_block == len(delta_hit_sequence)
    115. # Update the hit sequence and indices list.
    116. hit_sequence += delta_hit_sequence
    117. _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)
    118. return TemplateHit(
    119. index=number_of_hit,
    120. name=name_hit,
    121. aligned_cols=int(aligned_cols),
    122. sum_probs=sum_probs,
    123. query=query,
    124. hit_sequence=hit_sequence,
    125. indices_query=indices_query,
    126. indices_hit=indices_hit,
    127. )
    128. def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
    129. """Parses the content of an entire HHR file."""
    130. lines = hhr_string.splitlines()
    131. # Each .hhr file starts with a results table, then has a sequence of hit
    132. # "paragraphs", each paragraph starting with a line 'No '. We
    133. # iterate through each paragraph to parse each hit.
    134. block_starts = [i for i, line in enumerate(lines) if line.startswith('No ')]
    135. hits = []
    136. if block_starts:
    137. block_starts.append(len(lines)) # Add the end of the final block.
    138. for i in range(len(block_starts) - 1):
    139. hits.append(_parse_hhr_hit(lines[block_starts[i]:block_starts[i + 1]]))
    140. return hits
    141. ## query.hhr文件路径 path/to/hh-suite/data/query.hhr
    142. with open("query.hhr") as f:
    143. hhr_str = f.read()
    144. hits = parse_hhr(hhr_str)
    145. print(len(hits))
    146. print(hits)

    参考:

    python正则表达式

  • 相关阅读:
    无人驾驶相关硬件汇总
    Docker篇-(2)-Docker安装-centos
    ThreadLocal巨坑!内存泄露只是小儿科
    json入门教程+在java中的一些便捷操作
    redis 生成流水工具类
    项目实战第三十三讲:标准中心-属性体系
    零时科技 || Victor the Fortune攻击事件分析
    sqli-labs less9详解
    华钜同创:亚马逊开店有哪些注意事项
    #### 广告投放 ####
  • 原文地址:https://blog.csdn.net/qq_27390023/article/details/134325601