hhsearch is part of the HMM-HMM (Hidden Markov Model to Hidden Markov Model) comparison methods in the HH-suite software package. It performs highly sensitive protein sequence searches and is especially well suited to detecting remote homology.

Some of its main features and uses are:

HMM-HMM alignment: hhsearch represents the query and every protein family in the database as profile hidden Markov models (HMMs). Unlike conventional sequence-to-sequence alignment, an HMM-HMM comparison uses the information of an entire multiple sequence alignment, so it captures the sequence and structure patterns of a protein family far better.

Remote homology detection: a major strength of hhsearch is its ability to detect distantly related homologs. Because the comparison draws on much richer profile information, it finds remote homologs that plain sequence searches tend to miss.

Sensitivity and specificity: hhsearch is designed to keep sensitivity high while reducing false-positive matches, which makes it more reliable when looking for structural and functional similarity.

Database search: hhsearch can search large profile databases (such as PDB70 or SCOP70) for entries similar to a given query, as illustrated by the sketch and the wrapper module below.
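On the command line a search is a single call to the hhsearch binary. The minimal sketch below (the paths and database prefix are placeholders, and it assumes hhsearch is on the PATH) shows the basic invocation; the wrapper module that follows assembles exactly this kind of command with subprocess.

import subprocess

# Run hhsearch on an a3m-formatted MSA against one database prefix.
# Shell equivalent: hhsearch -i query.a3m -o output.hhr -d /path/to/pdb70
subprocess.run(
    ['hhsearch',                # assumes the binary is on the PATH
     '-i', 'query.a3m',         # input alignment in a3m format
     '-o', 'output.hhr',        # search report written in .hhr format
     '-d', '/path/to/pdb70'],   # database prefix (up to, not including, _hhm.ffindex)
    check=True)

The module below is essentially the HHSearch wrapper and the .hhr/Stockholm parsing helpers from AlphaFold's data pipeline, collected into a single file so it can be run standalone: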
- """Library to run HHsearch from Python."""
-
- import glob
- import os
- import subprocess
- from typing import Sequence, Optional, List, Iterable
- from absl import logging
- import contextlib
- import tempfile
- import dataclasses
- import contextlib
- import time
- import shutil
- import re
-
-
@contextlib.contextmanager
def timing(msg: str):
  logging.info('Started %s', msg)
  tic = time.time()
  yield
  toc = time.time()
  logging.info('Finished %s in %.3f seconds', msg, toc - tic)

@dataclasses.dataclass(frozen=True)
class TemplateHit:
  """Class representing a template hit."""
  index: int
  name: str
  aligned_cols: int
  sum_probs: Optional[float]
  query: str
  hit_sequence: str
  indices_query: List[int]
  indices_hit: List[int]

@contextlib.contextmanager
def tmpdir_manager(base_dir: Optional[str] = None):
  """Context manager that deletes a temporary directory on exit."""
  tmpdir = tempfile.mkdtemp(dir=base_dir)
  try:
    yield tmpdir
  finally:
    shutil.rmtree(tmpdir, ignore_errors=True)

def parse_hhr(hhr_string: str) -> Sequence[TemplateHit]:
  """Parses the content of an entire HHR file."""
  lines = hhr_string.splitlines()

  # Each .hhr file starts with a results table, then has a sequence of hit
  # "paragraphs", each paragraph starting with a line 'No <hit number>'. We
  # iterate through each paragraph to parse each hit.

  block_starts = [i for i, line in enumerate(lines) if line.startswith('No ')]

  hits = []
  if block_starts:
    block_starts.append(len(lines))  # Add the end of the final block.
    for i in range(len(block_starts) - 1):
      hits.append(_parse_hhr_hit(lines[block_starts[i]:block_starts[i + 1]]))
  return hits

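# For reference, a hit "paragraph" in an .hhr file looks roughly like the
# made-up example below (column widths not to scale); _parse_hhr_hit consumes
# exactly this structure:
#
#   No 1
#   >1abc_A Some template description
#   Probab=99.90  E-value=2.1e-30  Score=210.50  Aligned_cols=120  Identities=35%  Similarity=0.600  Sum_probs=110.0  Template_Neff=9.000
#
#   Q query             1 MKVLATG...   60 (245)
#   T 1abc_A            3 MKILASG...   62 (250)
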
def _parse_hhr_hit(detailed_lines: Sequence[str]) -> TemplateHit:
  """Parses the detailed HMM-HMM comparison section for a single hit.

  This works on .hhr files generated by both HHBlits and HHSearch.

  Args:
    detailed_lines: A list of lines from a single comparison section between 2
      sequences (which each have their own HMMs).

  Returns:
    A TemplateHit with the information from that detailed comparison section.

  Raises:
    RuntimeError: If a certain line cannot be processed.
  """
  # Parse first 2 lines.
  number_of_hit = int(detailed_lines[0].split()[-1])
  name_hit = detailed_lines[1][1:]

  # Parse the summary line.
  pattern = (
      'Probab=(.*)[\t ]*E-value=(.*)[\t ]*Score=(.*)[\t ]*Aligned_cols=(.*)[\t'
      ' ]*Identities=(.*)%[\t ]*Similarity=(.*)[\t ]*Sum_probs=(.*)[\t '
      ']*Template_Neff=(.*)')
  match = re.match(pattern, detailed_lines[2])
  if match is None:
    raise RuntimeError(
        'Could not parse section: %s. Expected this: \n%s to contain summary.' %
        (detailed_lines, detailed_lines[2]))
  (_, _, _, aligned_cols, _, _, sum_probs, _) = [float(x)
                                                 for x in match.groups()]

  # The next section reads the detailed comparisons. These are in a 'human
  # readable' format which has a fixed length. The strategy employed is to
  # assume that each block starts with the query sequence line, and to parse
  # that with a regexp in order to deduce the fixed length used for that block.
  query = ''
  hit_sequence = ''
  indices_query = []
  indices_hit = []
  length_block = None

  for line in detailed_lines[3:]:
    # Parse the query sequence line.
    if (line.startswith('Q ') and not line.startswith('Q ss_dssp') and
        not line.startswith('Q ss_pred') and
        not line.startswith('Q Consensus')):
      # Thus the first 17 characters must be 'Q <query_name>', and we can parse
      # everything after that.
      #              start      sequence       end      total_sequence_length
      patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*([0-9]*) \([0-9]*\)'
      groups = _get_hhr_line_regex_groups(patt, line[17:])

      # Get the length of the parsed block using the start and finish indices,
      # and ensure it is the same as the actual block length.
      start = int(groups[0]) - 1  # Make index zero based.
      delta_query = groups[1]
      end = int(groups[2])
      num_insertions = len([x for x in delta_query if x == '-'])
      length_block = end - start + num_insertions
      assert length_block == len(delta_query)

      # Update the query sequence and indices list.
      query += delta_query
      _update_hhr_residue_indices_list(delta_query, start, indices_query)

    elif line.startswith('T '):
      # Parse the hit sequence.
      if (not line.startswith('T ss_dssp') and
          not line.startswith('T ss_pred') and
          not line.startswith('T Consensus')):
        # Thus the first 17 characters must be 'T <hit_name>', and we can
        # parse everything after that.
        #              start      sequence       end    total_sequence_length
        patt = r'[\t ]*([0-9]*) ([A-Z-]*)[\t ]*[0-9]* \([0-9]*\)'
        groups = _get_hhr_line_regex_groups(patt, line[17:])
        start = int(groups[0]) - 1  # Make index zero based.
        delta_hit_sequence = groups[1]
        assert length_block == len(delta_hit_sequence)

        # Update the hit sequence and indices list.
        hit_sequence += delta_hit_sequence
        _update_hhr_residue_indices_list(delta_hit_sequence, start, indices_hit)

  return TemplateHit(
      index=number_of_hit,
      name=name_hit,
      aligned_cols=int(aligned_cols),
      sum_probs=sum_probs,
      query=query,
      hit_sequence=hit_sequence,
      indices_query=indices_query,
      indices_hit=indices_hit,
  )

def _get_hhr_line_regex_groups(
    regex_pattern: str, line: str) -> Sequence[Optional[str]]:
  match = re.match(regex_pattern, line)
  if match is None:
    raise RuntimeError(f'Could not parse query line {line}')
  return match.groups()


def _update_hhr_residue_indices_list(
    sequence: str, start_index: int, indices_list: List[int]):
  """Computes the relative indices for each residue with respect to the original sequence."""
  counter = start_index
  for symbol in sequence:
    if symbol == '-':
      indices_list.append(-1)
    else:
      indices_list.append(counter)
      counter += 1

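# Note on databases: an HHsearch database argument is a path prefix, not a
# single file. For pdb70, for instance, a prefix such as /data/pdb70/pdb70
# (path chosen only for illustration) typically refers to the files
# pdb70_hhm.ffdata/.ffindex, pdb70_a3m.ffdata/.ffindex and
# pdb70_cs219.ffdata/.ffindex produced when an HH-suite database is built.
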
class HHSearch:
  """Python wrapper of the HHsearch binary."""

  def __init__(self,
               *,
               binary_path: str,
               databases: Sequence[str],
               maxseq: int = 1_000_000):
    """Initializes the Python HHsearch wrapper.

    Args:
      binary_path: The path to the HHsearch executable.
      databases: A sequence of HHsearch database paths. This should be the
        common prefix for the database files (i.e. up to but not including
        _hhm.ffindex etc.)
      maxseq: The maximum number of rows in an input alignment. Note that this
        parameter is only supported in HHBlits version 3.1 and higher.

    Raises:
      RuntimeError: If HHsearch binary not found within the path.
    """
    self.binary_path = binary_path
    self.databases = databases
    self.maxseq = maxseq

    # Optional sanity check that each database prefix actually exists
    # (disabled here):
    #for database_path in self.databases:
    #  if not glob.glob(database_path + '_*'):
    #    logging.error('Could not find HHsearch database %s', database_path)
    #    raise ValueError(f'Could not find HHsearch database {database_path}')

  @property
  def output_format(self) -> str:
    return 'hhr'

  @property
  def input_format(self) -> str:
    return 'a3m'

  def query(self, a3m: str) -> str:
    """Queries the databases with HHsearch using a given a3m alignment."""
    with tmpdir_manager() as query_tmp_dir:
      input_path = os.path.join(query_tmp_dir, 'query.a3m')
      hhr_path = os.path.join(query_tmp_dir, 'output.hhr')
      with open(input_path, 'w') as f:
        f.write(a3m)

      # Each database is passed with its own -d flag.
      db_cmd = []
      for db_path in self.databases:
        db_cmd.append('-d')
        db_cmd.append(db_path)
      cmd = [self.binary_path,
             '-i', input_path,
             '-o', hhr_path,
             '-maxseq', str(self.maxseq)
             ] + db_cmd

      # Debug output: show the assembled command.
      print('cmd:', cmd)

      logging.info('Launching subprocess "%s"', ' '.join(cmd))
      process = subprocess.Popen(
          cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
      with timing('HHsearch query'):
        stdout, stderr = process.communicate()
        retcode = process.wait()

      if retcode:
        # Stderr is truncated to prevent proto size errors in Beam.
        raise RuntimeError(
            'HHSearch failed:\nstdout:\n%s\n\nstderr:\n%s\n' % (
                stdout.decode('utf-8'), stderr[:100_000].decode('utf-8')))

      with open(hhr_path) as f:
        hhr = f.read()
    return hhr

  def get_template_hits(self,
                        output_string: str,
                        input_sequence: str) -> Sequence[TemplateHit]:
    """Gets parsed template hits from the raw string output by the tool."""
    del input_sequence  # Used by hmmsearch but not needed for hhsearch.
    return parse_hhr(output_string)

def convert_stockholm_to_a3m(stockholm_format: str,
                             max_sequences: Optional[int] = None,
                             remove_first_row_gaps: bool = True) -> str:
  """Converts MSA in Stockholm format to the A3M format."""
  descriptions = {}
  sequences = {}
  reached_max_sequences = False

  for line in stockholm_format.splitlines():
    reached_max_sequences = max_sequences and len(sequences) >= max_sequences
    if line.strip() and not line.startswith(('#', '//')):
      # Ignore blank lines, markup and end symbols - remainder are alignment
      # sequence parts.
      seqname, aligned_seq = line.split(maxsplit=1)
      if seqname not in sequences:
        if reached_max_sequences:
          continue
        sequences[seqname] = ''
      sequences[seqname] += aligned_seq

  for line in stockholm_format.splitlines():
    if line[:4] == '#=GS':
      # Description row - example format is:
      # #=GS UniRef90_Q9H5Z4/4-78 DE [subseq from] cDNA: FLJ22755 ...
      columns = line.split(maxsplit=3)
      seqname, feature = columns[1:3]
      value = columns[3] if len(columns) == 4 else ''
      if feature != 'DE':
        continue
      if reached_max_sequences and seqname not in sequences:
        continue
      descriptions[seqname] = value
      if len(descriptions) == len(sequences):
        break

  # Convert sto format to a3m line by line.
  a3m_sequences = {}
  if remove_first_row_gaps:
    # query_sequence is assumed to be the first sequence.
    query_sequence = next(iter(sequences.values()))
    query_non_gaps = [res != '-' for res in query_sequence]
  for seqname, sto_sequence in sequences.items():
    # Dots are optional in a3m format and are commonly removed.
    out_sequence = sto_sequence.replace('.', '')
    if remove_first_row_gaps:
      out_sequence = ''.join(
          _convert_sto_seq_to_a3m(query_non_gaps, out_sequence))
    a3m_sequences[seqname] = out_sequence

  fasta_chunks = (f">{k} {descriptions.get(k, '')}\n{a3m_sequences[k]}"
                  for k in a3m_sequences)
  return '\n'.join(fasta_chunks) + '\n'  # Include terminating newline.


def _convert_sto_seq_to_a3m(
    query_non_gaps: Sequence[bool], sto_seq: str) -> Iterable[str]:
  for is_query_res_non_gap, sequence_res in zip(query_non_gaps, sto_seq):
    if is_query_res_non_gap:
      yield sequence_res
    elif sequence_res != '-':
      yield sequence_res.lower()

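# Toy illustration of the Stockholm-to-a3m conversion above (made-up
# sequences): with the first Stockholm row taken as the query, columns where
# the query has a gap are deleted from the query row and become lowercase
# insertion characters in the other rows.
#
#   Stockholm:              A3M:
#     query  AC-DE            >query
#     hit1   ACGDE            ACDE
#                             >hit1
#                             ACgDE
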
if __name__ == "__main__":

  ### 1. Prepare the input data.
  ## The input sequence is first searched with several Jackhmmer iterations
  ## against the UniRef90 and MGnify databases. The resulting multiple sequence
  ## alignment (e.g. globins4.sto) is converted to a3m format and then passed
  ## to hhsearch to find homologous templates in the PDB database.
  input_fasta_file = '/home/zheng/test/Q94K49.fasta'
  ## input_sequence
  with open(input_fasta_file) as f:
    input_sequence = f.read()

  test_templates_sto_file = "/home/zheng/test/Q94K49_aln.sto"
  with open(test_templates_sto_file) as f:
    test_templates_sto = f.read()

  ## Convert the Stockholm alignment to a3m format.
  test_templates_a3m = convert_stockholm_to_a3m(test_templates_sto)
  hhsearch_binary_path = "/home/zheng/software/hhsuite-3.3.0-SSE2-Linux/bin/hhsearch"

  ### 2. Instantiate the wrapper class.
  # Filename prefix of the scop70_1.75 database.
  scop70_database_path = "/home/zheng/database/scop70_1.75_hhsuite3/scop70_1.75"
  pdb70_database_path = "/home/zheng/database/pdb70_from_mmcif_latest/pdb70"
  # HH-suite databases can be downloaded from:
  # https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/

  ## Single database:
  #template_searcher = HHSearch(binary_path=hhsearch_binary_path,
  #                             databases=[scop70_database_path])

  ## Multiple databases:
  database_lst = [scop70_database_path, pdb70_database_path]
  template_searcher = HHSearch(binary_path=hhsearch_binary_path,
                               databases=database_lst)

  ### 3. Search for homologous templates.
  ## The search returns the contents of an .hhr result file as a string.
  templates_result = template_searcher.query(test_templates_a3m)
  print(templates_result)

  ## List of parsed template hits (PDB/SCOP entries).
  template_hits = template_searcher.get_template_hits(
      output_string=templates_result, input_sequence=input_sequence)
  print(template_hits)
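Each element of template_hits is a TemplateHit dataclass instance, so the top hit can be inspected field by field. An illustrative continuation of the block above, assuming the search returned at least one hit:

  top_hit = template_hits[0]
  print(top_hit.name, top_hit.aligned_cols, top_hit.sum_probs)
  print(top_hit.query)         # aligned query segment
  print(top_hit.hit_sequence)  # aligned template segment
  print(top_hit.indices_hit)   # zero-based residue indices into the template; -1 marks gaps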