python - Issue with Recognizing Catechol and Dimethoxybenzene Patterns Using RDKit

I am working on a Python program using RDKit to compare molecules and detect specific substructures, particularly catechol and dimethoxybenzene patterns. However, I am having trouble correctly recognizing these patterns, especially for the Piceatannol molecule.

Context:

I am using RDKit to generate molecular fingerprints and compare molecules. I am trying to detect the following patterns:

- Catechol: two hydroxyl groups (OH) on adjacent (ortho) positions of a benzene ring.
- Dimethoxybenzene: two methoxy groups (OCH3) on a benzene ring.

Problem:

Piceatannol (SMILES: OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O) is not being recognized as containing a catechol or dimethoxybenzene pattern, even though its structure seems to match these patterns.

Current Code:

Here is my code:

from rdkit import __version__
print('\n\nRDKit Version : ', __version__,'\n\n')

from rdkit import Chem
from rdkit.Chem import Descriptors, rdFingerprintGenerator, DataStructs

# Reference molecule (piceatannol).
# NOTE(review): in this SMILES the two hydroxyls on the second ring are meta
# to each other and the remaining hydroxyl of that fragment sits on the vinyl
# carbon, so the structure as written contains no ortho-dihydroxy (catechol)
# motif -- verify the string against a reference source for piceatannol.
piceatannol_smiles = "OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O"
piceatannol_mol = Chem.MolFromSmiles(piceatannol_smiles)

# Stop early if the reference SMILES could not be parsed into a molecule.
if piceatannol_mol is None:
    raise ValueError("❌ SMILES du Picéatannol invalide")

# Morgan (circular) fingerprint of the reference molecule.  The generator
# factory is GetMorganGenerator; the variable keeps its original name so any
# later use of `man_gen` in the script continues to work.
man_gen = rdFingerprintGenerator.GetMorganGenerator(radius=2, fpSize=2048, includeChirality=True)
piceatannol_fp = man_gen.GetFingerprint(piceatannol_mol)

# Substructure queries (SMARTS).
# Two C-O substituents on adjacent aromatic ring atoms (ortho-dimethoxybenzene).
dimethoxybenzene_pattern = Chem.MolFromSmarts("COc1ccccc1OC")
# Two hydroxyl groups on adjacent aromatic ring atoms (strict catechol).
catechol_pattern = Chem.MolFromSmarts("[OH]c1ccccc1[OH]")
# NOTE(review): the first [OH] here is bonded to two aromatic carbons while
# also carrying a hydrogen, so this query is unlikely to match any ordinary
# neutral molecule -- confirm what "flexible catechol" was meant to describe.
flexible_catechol_pattern = Chem.MolFromSmarts("c1ccccc1[OH]c2ccccc2[OH]")
# Whole-molecule query for piceatannol.  The parsed molecule is used directly:
# feeding the Kekule SMILES to MolFromSmarts would turn every uppercase "C"
# into an aliphatic-carbon query that cannot match the aromatic ring atoms.
piceatannol_specific_pattern = piceatannol_mol

# Liste des molécules avec leurs SMILES
# molecules = {
#     "Cianidanol": "C1=CC(=C(C=C1C2C(C(C(O2)O)O)O)O)",
#     "Dopamine": "NCCc1ccc(O)c(O)c1",
#     "Tolcapone": "CC(=O)NC1=CC(=C(C=C1)O)C2=CC(=C(C=C2)O)C(=O)O",
#     "Droxidopa": "C1=CC(=C(C=C1C(C(C(=O)O)N)O)O)",
#     "Isoprenaline": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Levonordefrin": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Norépinéphrine": "NCCc1ccc(O)c(O)c1",
#     "Arbutamine": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Méthyldopa": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Paroxétine": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Tadalafil": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Épinéphrine": "NCCc1ccc(O)c(O)c1",
#     "Micafungine": "NCCc1ccc(O)c(O)c1",
#     "Orciprénaline": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Carbidopa": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Rutine": "C1=CC(=C(C=C1C2C(C(C(O2)O)O)O)O)",
#     "Fluorodopa": "NCCc1ccc(O)c(O)c1",
#     "Céfiderocol": "NCCc1ccc(O)c(O)c1",
#     "Éthinylestradiol": "NCCc1ccc(O)c(O)c1",
#     "Phénytoïne": "NCCc1ccc(O)c(O)c1",
#     "Opicapone": "NCCc1ccc(O)c(O)c1",
#     "Isoétharine": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Entacapone": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Métyrosine": "CC(C)NCCc1ccc(O)c(O)c1",
#     "Lévodopa": "NCCc1ccc(O)c(O)c1",
#     "Stiripentol": "NCCc1ccc(O)c(O)c1",
#     "Duloxétine": "CC(C)NCCc1ccc(O)c(O)c1",
#     "2-Hydroxyestradiol": "NCCc1ccc(O)c(O)c1",
#     "Tyrosinase": "NCCc1ccc(O)c(O)c1",
#     "1,2-Dihydroxybenzène": "C1=CC(=C(C=C1O)O)O",
#     "3-Méthyl-benzène-1,2-diol": "CC1=CC(=C(C=C1O)O)O",
#     "4-Méthyl-1,2-Benzènediol": "CC1=CC(=C(C=C1O)O)O",
#     "3,4-Biphényldiol": "C1=CC(=C(C=C1C2=CC=CC=C2)O)O",
#     "Picéatannol": "OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O",  # SMILES du Picéatannol
#     "Protocatéchualdéhyde": "C1=CC(=C(C=C1C=O)O)O",
#     "2'-Chloro-Biphényl-2,3-Diol": "C1=CC(=C(C=C1C2=CC(=C(C=C2)Cl)O)O)",
#     "Hydroxytyrosol": "C1=CC(=C(C=C1C(CO)O)O)O",
#     "4-(3-éthylthiophén-2-yl)benzène-1,2-diol": "CC1=CSC(=C1)C2=CC(=C(C=C2)O)O",
#     "Masoprocol": "C1=CC(=C(C=C1C2=CC(=C(C=C2)O)O)O)O",
#     "5-(3,3-Dihydroxypropeny)-3-Méthoxy-Benzène-1,2-Diol": "C1=CC(=C(C=C1C=CC(CO)O)O)O",
#     "Didox": "C1=CC(=C(C=C1C2=CC(=C(C=C2)O)O)O)",
#     "4-Nitrocatechol": "C1=CC(=C(C=C1[N+](=O)[O-])O)O"
# }


# Molecules to screen: display name -> SMILES string.
molecules = {
    "Piceatannol": "OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O",
}


# Stocker les résultats
results = []

for name, smiles in molecules.items():
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        print(f"❌ Erreur avec la molécule : {name} (SMILES invalide)")
        continue

    # Vérifier la présence de diméthoxybenzène, catéchol strict, catéchol flexible ou motif spécifique au Picéatannol
    has_dimethoxybenzene = mol.HasSubstructMatch(dimethoxybenzene_pattern)
    has_catechol_strict = mol.HasSubstructMatch(catechol_pattern)
    has_catechol_flexible = mol.HasSubstructMatch(flexible_catechol_pattern)
    has_piceatannol_specific = mol.HasSubstructMatch(piceatannol_specific_pattern)

    print(f"

科技改变生活-雨落星辰 - 所有的伟大,都源于一个勇敢的开始

python - Issue with Recognizing Catechol and Dimethoxybenzene Patterns Using RDKit - Stack Overflow

`与本文相关的文章`

`评论列表(0)`