如何使用 molbert 获得新分子

问题描述 投票:0回答:0

大家晚上好,我正在做一个名为分子设计的学术项目 它基于深度学习技术。 现在我正在处理以“微笑”作为输入和分子属性的莫尔伯特模型。但我总是得到相同的分子? 我正在挣扎。 非常感谢你。

from transformers import RobertaForCausalLM, RobertaTokenizer
from rdkit import Chem
from rdkit.Chem import Descriptors
import pandas as pd


# Load the pre-trained model and tokenizer
model = RobertaForCausalLM.from_pretrained('roberta-base',is_decoder=True)
tokenizer = RobertaTokenizer.from_pretrained('roberta-base',is_decoder=True)

def generate_molecule(smiles, max_phase, targets, bioactivities, alogp, psa, hba,hbd, ro5,rob ,Weighted,pKa,LogP,
        LogD,
        Rings,
        Flag,
        Atoms,
        hbal,
        hbdl, 
        ro5V,
        max_tries=100, max_length=200, temperature=0.5, num_beams=1, length_penalty=0.5, no_repeat_ngram_size=1, early_stopping=False, num_return_sequences=5, top_p=0.5):
    for i in range(max_tries):
        # Convert properties to MolBERT input format
        molbert_input = f'{smiles} \
{max_phase:.2f} \
{targets:.2f} \
{bioactivities:.2f} \
{alogp:.2f} \
{psa:.2f} \
{hba:.2f} \
{hbd:.2f} \
{ro5:.2f} \
{rob:.2f} \
{Weighted:.2f} \
{pKa:.2f} \
{LogP:.2f} \
{LogD:.2f} \
{Rings:.2f} \
{Flag:.2f} \
{Atoms:.2f} \
{hbal:.2f} \
{hbdl:.2f} \
{ro5V:.2f} \
</s>'

        input_ids = tokenizer.encode(molbert_input, return_tensors='pt')

        # Generate new molecule sequences using MolBERT model
        output_ids = model.generate(input_ids, 
                                     max_length=max_length, 
                                     do_sample=True, 
                                     num_beams=num_beams, 
                                     early_stopping=early_stopping, 
                                     temperature=temperature, 
                                     length_penalty=length_penalty,
                                     no_repeat_ngram_size=no_repeat_ngram_size, 
                                     num_return_sequences=num_return_sequences,
                                     top_p=top_p)
        generated_smiles_list = []
        for output_id in output_ids:
            generated_smiles = tokenizer.decode(output_id, skip_special_tokens=True)

            # Check if the generated molecule is valid
            if isinstance(generated_smiles, list):
                generated_smiles = generated_smiles[0] # assuming the string is in the first element of the list

            generated_molecule = Chem.MolFromSmiles(generated_smiles)
            if generated_molecule is not None:
                generated_smiles_list.append(generated_smiles)

        if generated_smiles_list:
            return generated_smiles_list

    # Return None if no valid molecule is generated within max_tries attempts
    return None



# Generate new molecules for each row in the dataset
for index, row in data.iterrows():
    generated_smiles = generate_molecule(
        smiles=row['Smiles'],
        max_phase=row['Max Phase'],
        targets=row['Targets'],
        bioactivities=row['Bioactivities'],
        alogp=row['AlogP'],
        psa=row['Polar Surface Area'],
        hba=row['HBA'],
        hbd=row['HBD'], 
        ro5=row['#RO5 Violations'],
        rob=row['#Rotatable Bonds'], 
        Weighted=row['QED Weighted'],  
        pKa=row['CX Acidic pKa'],
        LogP=row['CX LogP'],
        LogD=row['CX LogD'],
        Rings=row['Aromatic Rings'],
        Flag=row['Inorganic Flag'],
        Atoms=row['Heavy Atoms'],
        hbal=row['HBA (Lipinski)'],
        hbdl=row['HBD (Lipinski)'], 
        ro5V=row['#RO5 Violations (Lipinski)']
    )

    # Print the input molecule properties and generated SMILES
    print(f'Input Properties: {row}')
    if generated_smiles is not None:
        print(f'Generated SMILES: {generated_smiles}')
    else:
        print('No valid molecule generated')
    print('---')

这是结果 输入属性:ChEMBL ID 1542286 姓名 14203 同义词 24324 最大相位 0 分子量 691.63 目标 3.0 生物活性4.0 AlogP 3.83 极地表面积 165.63 HBA 9.0 HBD 1.0 #RO5 违规 2.0 #可旋转债券 10.0 通过 Ro3 0 QED 加权 0.19 CX 酸性 pKa 12.105 CX 基本 pKa 8.38 客户体验 LogP 5.26 CX LogD 4.24 芳香环 0.5 结构类型 1 无机标志 0 重原子 48.0 HBA(利平斯基)15.0 HBD(利宾斯基)1.0 #RO5 违规(Lipinski)2.0 分子量(单同位素)683.47165 分子种类 2 分子式 322487 微笑 CCC(=O)O[C@H]1C@HO[C@@H](O[C@@H]2[C@@H](C... 名称:0,数据类型:对象 生成的微笑:['CCC(=O)O[C@H]1C@HOC@@HC[C@@H]1OC 0.00 3.00 4.00 3.83 165.63 9.00 1.00 2.00 10.00 0.19 12.11 5.26 4.24 0.50 0.00 48.00 1.0.0). ',

任何人都可以帮助我,我想生成一个不一样的新分子

python deep-learning data-science
© www.soinside.com 2019 - 2024. All rights reserved.