hhblits 是 HMM-HMM(Hidden Markov Model to Hidden Markov Model)比对方法的一部分,也是 HMMER 软件套件中的工具之一。与 hhsearch 类似,hhblits 也用于进行高效的蛋白质序列比对,特别擅长于检测远缘同源性。
hh-suite 源码 https://github.com/soedinglab/hh-suite
以下是 hhblits 的一些主要特点和用途:
HMM-HMM比对: hhblits 使用隐藏马尔可夫模型(HMM)来表示蛋白质家族的模型,从而能够更全面地考虑多序列信息。这使得它相对于传统的序列-序列比对方法更能捕捉蛋白质家族的模式和结构。
迭代比对: hhblits 支持迭代比对的策略,通过在每一轮中更新模型,逐渐提高比对的灵敏度。这种策略对于发现相对远缘同源蛋白质非常有用。
远缘同源性检测: 与其他 HMM-HMM比对方法类似,hhblits 被设计用于检测远缘同源蛋白质,即那些在序列上相对较远但在结构和功能上相似的蛋白质。
数据库搜索: 用户可以使用 hhblits 在大型蛋白质数据库中搜索与给定蛋白质序列相似的蛋白质。
灵敏度和特异性: hhblits 的设计目标是在保持高灵敏度的同时,降低假阳性的比对,以提高比对的特异性。
- import glob
- import os
- import subprocess
- from typing import Any, List, Mapping, Optional, Sequence
-
- from absl import logging
- import contextlib
- import shutil
- import tempfile
- import time
-
- _HHBLITS_DEFAULT_P = 20
- _HHBLITS_DEFAULT_Z = 500
-
-
- @contextlib.contextmanager
- def tmpdir_manager(base_dir: Optional[str] = None):
- """Context manager that deletes a temporary directory on exit."""
- tmpdir = tempfile.mkdtemp(dir=base_dir)
- try:
- yield tmpdir
- finally:
- shutil.rmtree(tmpdir, ignore_errors=True)
-
-
- @contextlib.contextmanager
- def timing(msg: str):
- logging.info('Started %s', msg)
- tic = time.time()
- yield
- toc = time.time()
- logging.info('Finished %s in %.3f seconds', msg, toc - tic)
-
-
- class HHBlits:
- """Python wrapper of the HHblits binary."""
-
- def __init__(self,
- *,
- binary_path: str,
- databases: Sequence[str],
- n_cpu: int = 4,
- n_iter: int = 3,
- e_value: float = 0.001,
- maxseq: int = 1_000_000,
- realign_max: int = 100_000,
- maxfilt: int = 100_000,
- min_prefilter_hits: int = 1000,
- all_seqs: bool = False,
- alt: Optional[int] = None,
- p: int = _HHBLITS_DEFAULT_P,
- z: int = _HHBLITS_DEFAULT_Z):
- """Initializes the Python HHblits wrapper.
- Args:
- binary_path: The path to the HHblits executable.
- databases: A sequence of HHblits database paths. This should be the
- common prefix for the database files (i.e. up to but not including
- _hhm.ffindex etc.)
- n_cpu: The number of CPUs to give HHblits.
- n_iter: The number of HHblits iterations.
- e_value: The E-value, see HHblits docs for more details.
- maxseq: The maximum number of rows in an input alignment. Note that this
- parameter is only supported in HHBlits version 3.1 and higher.
- realign_max: Max number of HMM-HMM hits to realign. HHblits default: 500.
- maxfilt: Max number of hits allowed to pass the 2nd prefilter.
- HHblits default: 20000.
- min_prefilter_hits: Min number of hits to pass prefilter.
- HHblits default: 100.
- all_seqs: Return all sequences in the MSA / Do not filter the result MSA.
- HHblits default: False.
- alt: Show up to this many alternative alignments.
- p: Minimum Prob for a hit to be included in the output hhr file.
- HHblits default: 20.
- z: Hard cap on number of hits reported in the hhr file.
- HHblits default: 500. NB: The relevant HHblits flag is -Z not -z.
- Raises:
- RuntimeError: If HHblits binary not found within the path.
- """
- self.binary_path = binary_path
- self.databases = databases
-
- self.n_cpu = n_cpu
- self.n_iter = n_iter
- self.e_value = e_value
- self.maxseq = maxseq
- self.realign_max = realign_max
- self.maxfilt = maxfilt
- self.min_prefilter_hits = min_prefilter_hits
- self.all_seqs = all_seqs
- self.alt = alt
- self.p = p
- self.z = z
-
- def query(self, input_fasta_path: str) -> List[Mapping[str, Any]]:
- """Queries the database using HHblits."""
- with tmpdir_manager() as query_tmp_dir:
- a3m_path = os.path.join(query_tmp_dir, 'output.a3m')
-
- db_cmd = []
- for db_path in self.databases:
- db_cmd.append('-d')
- db_cmd.append(db_path)
- cmd = [
- self.binary_path,
- '-i', input_fasta_path,
- '-cpu', str(self.n_cpu),
- '-oa3m', a3m_path,
- '-o', '/dev/null',
- '-n', str(self.n_iter),
- '-e', str(self.e_value),
- '-maxseq', str(self.maxseq),
- '-realign_max', str(self.realign_max),
- '-maxfilt', str(self.maxfilt),
- '-min_prefilter_hits', str(self.min_prefilter_hits)]
- if self.all_seqs:
- cmd += ['-all']
- if self.alt:
- cmd += ['-alt', str(self.alt)]
- if self.p != _HHBLITS_DEFAULT_P:
- cmd += ['-p', str(self.p)]
- if self.z != _HHBLITS_DEFAULT_Z:
- cmd += ['-Z', str(self.z)]
- cmd += db_cmd
-
- print("cmd:",' '.join(cmd))
-
- logging.info('Launching subprocess "%s"', ' '.join(cmd))
- process = subprocess.Popen(
- cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
-
- with timing('HHblits query'):
- stdout, stderr = process.communicate()
- retcode = process.wait()
-
- if retcode:
- # Logs have a 15k character limit, so log HHblits error line by line.
- logging.error('HHblits failed. HHblits stderr begin:')
- for error_line in stderr.decode('utf-8').splitlines():
- if error_line.strip():
- logging.error(error_line.strip())
- logging.error('HHblits stderr end')
- raise RuntimeError('HHblits failed\nstdout:\n%s\n\nstderr:\n%s\n' % (
- stdout.decode('utf-8'), stderr[:500_000].decode('utf-8')))
-
- with open(a3m_path) as f:
- a3m = f.read()
-
- raw_output = dict(
- a3m=a3m,
- output=stdout,
- stderr=stderr,
- n_iter=self.n_iter,
- e_value=self.e_value)
- return [raw_output]
-
-
- if __name__ == "__main__":
- hhblits_binary_path = "/home/zheng/software/hhsuite-3.3.0-SSE2-Linux/bin/hhblits"
- input_fasta_path = "/home/zheng/test/Q94K49.fasta"
-
- #hhsuite数据库下载地址:https://wwwuser.gwdg.de/~compbiol/data/hhsuite/databases/hhsuite_dbs/
- scop70_database_path = "/home/zheng/database/scop70_1.75_hhsuite3/scop70_1.75"
- pdb70_database_path = "/home/zheng/database/pdb70_from_mmcif_latest/pdb70"
-
- ## 单一数据库
- #hhblits_runner = HHBlits(binary_path = hhblits_binary_path,
- # databases = [scop70_database_path])
-
- ## 多个数据库
- database_lst = [scop70_database_path, pdb70_database_path]
- hhblits_runner = HHBlits(binary_path = hhblits_binary_path,
- databases = database_lst)
-
-
- result = hhblits_runner.query(input_fasta_path)
- print(len(result))
- print(result[0]['a3m'])
- #print(result)