- from rdkit import Chem
- from rdkit.Chem import BRICS
- import numpy as np
-
-
- # --------------------- 将分子拆分为不带数字或者自定义 ---------------------
def fragment_recursive(mol, frags):
    """Split ``mol`` into fragment SMILES by cutting one BRICS bond at a time.

    While ``BRICS.FindBRICSBonds`` still reports bonds, the BRICS bond with
    the lowest bond index is cut with ``Chem.FragmentOnBonds``. The first
    resulting piece is appended to ``frags`` as SMILES and the second piece
    is processed next. Once no BRICS bond is left, the remaining molecule is
    appended as the last fragment.

    The name is kept for backward compatibility; the implementation is a
    loop, so the number of cuts is not limited by Python's recursion depth.

    Args:
        mol: RDKit molecule to fragment.
        frags: List that receives the fragment SMILES. It is mutated in
            place.

    Returns:
        ``frags`` on success. ``None`` if any step raises; the error is
        printed, and fragments appended before the failure stay in ``frags``.
    """
    try:
        while True:
            bonds = list(BRICS.FindBRICSBonds(mol))
            if not bonds:
                # Nothing left to cut: what remains is the final fragment.
                frags.append(Chem.MolToSmiles(mol))
                return frags

            # Each entry is ((atom1, atom2), (label1, label2)); only the atom
            # pair is needed to look up the bond. Cut the lowest-indexed one.
            first_bond_idx = min(
                mol.GetBondBetweenAtoms(a1, a2).GetIdx()
                for (a1, a2), _labels in bonds
            )
            # dummyLabels=(0, 0): presumably so that the cut points are
            # written as plain '*' in the SMILES - verify against RDKit docs.
            broken = Chem.FragmentOnBonds(
                mol, bondIndices=[first_bond_idx], dummyLabels=[(0, 0)]
            )
            # Unpacking expects exactly two pieces; any other count raises
            # ValueError and is reported by the handler below.
            head, tail = Chem.GetMolFrags(broken, asMols=True)
            frags.append(Chem.MolToSmiles(head))
            mol = tail
    except Exception as e:
        # Best-effort contract: report the problem and signal failure with
        # None instead of propagating the exception to the caller.
        print(e)
        return None
-
-
- # --------------------- 将*号去掉 ---------------------
def remove_dummy(smiles):
    """Replace the ``*`` dummy atoms in ``smiles`` and return re-generated SMILES.

    Every ``*`` character is replaced by ``[H]``. The result is parsed with
    ``Chem.MolFromSmiles`` and written back with ``Chem.MolToSmiles``.

    Args:
        smiles: Fragment SMILES string, possibly containing ``*`` dummy atoms.

    Returns:
        The SMILES produced by RDKit for the dummy-free molecule, or ``None``
        if the input is not a string, cannot be parsed, or any RDKit call
        raises (a message is printed in each case).
    """
    try:
        stripped_smi = smiles.replace('*', '[H]')
        mol = Chem.MolFromSmiles(stripped_smi)
        if mol is None:
            # Check the parse result explicitly instead of relying on
            # MolToSmiles(None) raising an opaque error.
            print(f"Could not parse SMILES: {stripped_smi}")
            return None
        return Chem.MolToSmiles(mol)
    except Exception as e:
        print(e)
        return None
- # 单个smiles拆分为fragment
# Fragment a single SMILES string and print the dummy-free fragments.
# NOTE(review): the variable is named `aspirin`, but this SMILES contains
# N and F atoms, which aspirin does not - confirm the intended molecule.
aspirin = Chem.MolFromSmiles(
    'C1CC1C(=O)N2CCN(CC2)C(=O)C3=C(C=CC(=C3)CC4=NNC(=O)C5=CC=CC=C54)F'
)
fragments = fragment_recursive(aspirin, [])
clean_fragments = list(map(remove_dummy, fragments))
print(clean_fragments)
- # csv文件的每一行smiles拆分为fragment + 删除重复的行
- import csv
- import pandas as pd
-
- # 读取原始 CSV 文件
# Read every row of the input CSV; the SMILES is taken from the first column.
# newline='' is what the csv module expects for files handed to csv.reader.
with open('csv/double.csv', 'r', newline='') as csv_file:
    input_data = list(csv.reader(csv_file))

# Fragment each SMILES and collect the cleaned fragments, one list per row.
output_data = []
for row in input_data:
    try:
        fragments = fragment_recursive(Chem.MolFromSmiles(row[0]), [])
    except Exception:
        # e.g. IndexError for an empty row. Unlike a bare `except:`, this
        # does not swallow KeyboardInterrupt / SystemExit.
        fragments = None
    if fragments is None:
        # fragment_recursive signals failure by returning None.
        print("Current smiles process Error: ", row)
        continue
    output_data.append([remove_dummy(smi) for smi in fragments])

# Flatten row by row, skip fragments that could not be cleaned (None) and
# drop duplicates while keeping first-seen order. Doing this in memory avoids
# writing the file, reading it back and rewriting it, which failed on an
# empty intermediate file.
new_csv_name = 'output.smi'
unique_smiles = list(dict.fromkeys(
    smi for frag_list in output_data for smi in frag_list if smi
))
df = pd.DataFrame(unique_smiles)
df.to_csv(new_csv_name, index=False, header=False)
RDKit 的 BRICS 算法:BRICS 基于常见的化学反应来选择片段的断键位点,因此拆分得到的片段在化学合成意义上具有可行性。
- from rdkit.Chem import Recap
- from rdkit.Chem import AllChem as Chem
-
# Run the Recap decomposition on an example molecule.
m = Chem.MolFromSmiles('c1ccccc1OCCOC(=O)CC')
hierarch = Recap.RecapDecompose(m)

# Leaf nodes: hierarch.GetLeaves()
leaf_nodes = hierarch.GetLeaves()
print(leaf_nodes.keys())

# All descendant nodes: hierarch.GetAllChildren()
descendant_nodes = hierarch.GetAllChildren()
print(descendant_nodes.keys())

# Ultimate ancestor nodes, returned as a list: getUltimateParents()
ultimate_parents = hierarch.getUltimateParents()
print(ultimate_parents[0].smiles)
-