大家晚上好,我正在做一个名为分子设计的学术项目 它基于深度学习技术。 现在我正在处理以“微笑”作为输入和分子属性的莫尔伯特模型。但我总是得到相同的分子? 我正在挣扎。 非常感谢你。
from transformers import RobertaForCausalLM, RobertaTokenizer
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd
# Load the pre-trained model and tokenizer
model = RobertaForCausalLM.from_pretrained('roberta-base',is_decoder=True)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base',is_decoder=True)
def generate_molecule(smiles, max_phase, targets, bioactivities, alogp, psa, hba,hbd, ro5,rob ,Weighted,pKa,LogP,
LogD,
Rings,
Flag,
Atoms,
hbal,
hbdl,
ro5V,
max_tries=100, max_length=200, temperature=0.5, num_beams=1, length_penalty=0.5, no_repeat_ngram_size=1, early_stopping=False, num_return_sequences=5, top_p=0.5):
for i in range(max_tries):
# Convert properties to MolBERT input format
molbert_input = f'{smiles} \
{max_phase:.2f} \
{targets:.2f} \
{bioactivities:.2f} \
{alogp:.2f} \
{psa:.2f} \
{hba:.2f} \
{hbd:.2f} \
{ro5:.2f} \
{rob:.2f} \
{Weighted:.2f} \
{pKa:.2f} \
{LogP:.2f} \
{LogD:.2f} \
{Rings:.2f} \
{Flag:.2f} \
{Atoms:.2f} \
{hbal:.2f} \
{hbdl:.2f} \
{ro5V:.2f} \
</s>'
input_ids = tokenizer.encode(molbert_input, return_tensors='pt')
# Generate new molecule sequences using MolBERT model
output_ids = model.generate(input_ids,
max_length=max_length,
do_sample=True,
num_beams=num_beams,
early_stopping=early_stopping,
temperature=temperature,
length_penalty=length_penalty,
no_repeat_ngram_size=no_repeat_ngram_size,
num_return_sequences=num_return_sequences,
top_p=top_p)
generated_smiles_list = []
for output_id in output_ids:
generated_smiles = tokenizer.decode(output_id, skip_special_tokens=True)
# Check if the generated molecule is valid
if isinstance(generated_smiles, list):
generated_smiles = generated_smiles[0] # assuming the string is in the first element of the list
generated_molecule = Chem.MolFromSmiles(generated_smiles)
if generated_molecule is not None:
generated_smiles_list.append(generated_smiles)
if generated_smiles_list:
return generated_smiles_list
# Return None if no valid molecule is generated within max_tries attempts
return None
# Generate new molecules for each row in the dataset
for index, row in data.iterrows():
generated_smiles = generate_molecule(
smiles=row['Smiles'],
max_phase=row['Max Phase'],
targets=row['Targets'],
bioactivities=row['Bioactivities'],
alogp=row['AlogP'],
psa=row['Polar Surface Area'],
hba=row['HBA'],
hbd=row['HBD'],
ro5=row['#RO5 Violations'],
rob=row['#Rotatable Bonds'],
Weighted=row['QED Weighted'],
pKa=row['CX Acidic pKa'],
LogP=row['CX LogP'],
LogD=row['CX LogD'],
Rings=row['Aromatic Rings'],
Flag=row['Inorganic Flag'],
Atoms=row['Heavy Atoms'],
hbal=row['HBA (Lipinski)'],
hbdl=row['HBD (Lipinski)'],
ro5V=row['#RO5 Violations (Lipinski)']
)
# Print the input molecule properties and generated SMILES
print(f'Input Properties: {row}')
if generated_smiles is not None:
print(f'Generated SMILES: {generated_smiles}')
else:
print('No valid molecule generated')
print('---')
这是结果 输入属性:ChEMBL ID 1542286 姓名 14203 同义词 24324 最大相位 0 分子量 691.63 目标 3.0 生物活性4.0 AlogP 3.83 极地表面积 165.63 HBA 9.0 HBD 1.0 #RO5 违规 2.0 #可旋转债券 10.0 通过 Ro3 0 QED 加权 0.19 CX 酸性 pKa 12.105 CX 基本 pKa 8.38 客户体验 LogP 5.26 CX LogD 4.24 芳香环 0.5 结构类型 1 无机标志 0 重原子 48.0 HBA(利平斯基)15.0 HBD(利宾斯基)1.0 #RO5 违规(Lipinski)2.0 分子量(单同位素)683.47165 分子种类 2 分子式 322487 微笑 CCC(=O)O[C@H]1C@HO[C@@H](O[C@@H]2[C@@H](C... 名称:0,数据类型:对象 生成的微笑:['CCC(=O)O[C@H]1C@HOC@@HC[C@@H]1OC 0.00 3.00 4.00 3.83 165.63 9.00 1.00 2.00 10.00 0.19 12.11 5.26 4.24 0.50 0.00 48.00 1.0.0). ',
任何人都可以帮助我,我想生成一个不一样的新分子