对分子的 SMILES 字符串进行处理:先将其转换为词汇数据,再将其转换为字符串长度恒定的张量。
- import torch
- import pandas as pd
- from collections import Counter
-
- class Dataset(torch.utils.data.Dataset):
- def __init__(self, url, smiles_col, sequence_length=4):
- self.url = url
- self.smiles_col = smiles_col
- self.sequence_length = sequence_length
- self.smiles = []
- self.words = self.load_words()
- self.uniq_words = self.get_uniq_words()
- self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
- self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}
- self.words_indexes = [self.word_to_index[w] for w in sel