I am working on a Python program using RDKit to compare molecules and detect specific substructures, particularly catechol and dimethoxybenzene patterns. However, I am having trouble correctly recognizing these patterns, especially for the Piceatannol molecule.
Context:
I am using RDKit to generate molecular fingerprints and compare molecules. I am trying to detect the following patterns: Catechol: Two hydroxyl groups (OH) on a benzene ring (ortho positions). Dimethoxybenzene: Two methoxy groups (OCH3) on a benzene ring. Problem:
Piceatannol (SMILES: OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O) is not being recognized as containing a catechol or dimethoxybenzene pattern, even though its structure seems to match these patterns.
Current Code:
Here is my code:
from rdkit import __version__
print('\n\nRDKit Version : ', __version__,'\n\n')
from rdkit import Chem
from rdkit.Chem import Descriptors, rdFingerprintGenerator, DataStructs
# Définir la molécule de référence (Picéatannol)
piceatannol_smiles = "OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O"
piceatannol_mol = Chem.MolFromSmiles(piceatannol_smiles)
# Vérifier que la molécule de référence est valide
if piceatannol_mol is None:
raise ValueError("❌ SMILES du Picéatannol invalide")
# Générer l'empreinte Man
man_gen = rdFingerprintGenerator.GetManGenerator(radius=2, fpSize=2048, includeChirality=True)
piceatannol_fp = man_gen.GetFingerprint(piceatannol_mol)
# Définir les sous-structures avec SMARTS améliorés
dimethoxybenzene_pattern = Chem.MolFromSmarts("COc1ccccc1OC") # Diméthoxybenzène
catechol_pattern = Chem.MolFromSmarts("[OH]c1ccccc1[OH]") # Catéchol strict
flexible_catechol_pattern = Chem.MolFromSmarts("c1ccccc1[OH]c2ccccc2[OH]") # Catéchol flexible
piceatannol_specific_pattern = Chem.MolFromSmarts(piceatannol_smiles) # Motif spécifique au Picéatannol
# Liste des molécules avec leurs SMILES
# molecules = {
# "Cianidanol": "C1=CC(=C(C=C1C2C(C(C(O2)O)O)O)O)",
# "Dopamine": "NCCc1ccc(O)c(O)c1",
# "Tolcapone": "CC(=O)NC1=CC(=C(C=C1)O)C2=CC(=C(C=C2)O)C(=O)O",
# "Droxidopa": "C1=CC(=C(C=C1C(C(C(=O)O)N)O)O)",
# "Isoprenaline": "CC(C)NCCc1ccc(O)c(O)c1",
# "Levonordefrin": "CC(C)NCCc1ccc(O)c(O)c1",
# "Norépinéphrine": "NCCc1ccc(O)c(O)c1",
# "Arbutamine": "CC(C)NCCc1ccc(O)c(O)c1",
# "Méthyldopa": "CC(C)NCCc1ccc(O)c(O)c1",
# "Paroxétine": "CC(C)NCCc1ccc(O)c(O)c1",
# "Tadalafil": "CC(C)NCCc1ccc(O)c(O)c1",
# "Épinéphrine": "NCCc1ccc(O)c(O)c1",
# "Micafungine": "NCCc1ccc(O)c(O)c1",
# "Orciprénaline": "CC(C)NCCc1ccc(O)c(O)c1",
# "Carbidopa": "CC(C)NCCc1ccc(O)c(O)c1",
# "Rutine": "C1=CC(=C(C=C1C2C(C(C(O2)O)O)O)O)",
# "Fluorodopa": "NCCc1ccc(O)c(O)c1",
# "Céfiderocol": "NCCc1ccc(O)c(O)c1",
# "Éthinylestradiol": "NCCc1ccc(O)c(O)c1",
# "Phénytoïne": "NCCc1ccc(O)c(O)c1",
# "Opicapone": "NCCc1ccc(O)c(O)c1",
# "Isoétharine": "CC(C)NCCc1ccc(O)c(O)c1",
# "Entacapone": "CC(C)NCCc1ccc(O)c(O)c1",
# "Métyrosine": "CC(C)NCCc1ccc(O)c(O)c1",
# "Lévodopa": "NCCc1ccc(O)c(O)c1",
# "Stiripentol": "NCCc1ccc(O)c(O)c1",
# "Duloxétine": "CC(C)NCCc1ccc(O)c(O)c1",
# "2-Hydroxyestradiol": "NCCc1ccc(O)c(O)c1",
# "Tyrosinase": "NCCc1ccc(O)c(O)c1",
# "1,2-Dihydroxybenzène": "C1=CC(=C(C=C1O)O)O",
# "3-Méthyl-benzène-1,2-diol": "CC1=CC(=C(C=C1O)O)O",
# "4-Méthyl-1,2-Benzènediol": "CC1=CC(=C(C=C1O)O)O",
# "3,4-Biphényldiol": "C1=CC(=C(C=C1C2=CC=CC=C2)O)O",
# "Picéatannol": "OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O", # SMILES du Picéatannol
# "Protocatéchualdéhyde": "C1=CC(=C(C=C1C=O)O)O",
# "2'-Chloro-Biphényl-2,3-Diol": "C1=CC(=C(C=C1C2=CC(=C(C=C2)Cl)O)O)",
# "Hydroxytyrosol": "C1=CC(=C(C=C1C(CO)O)O)O",
# "4-(3-éthylthiophén-2-yl)benzène-1,2-diol": "CC1=CSC(=C1)C2=CC(=C(C=C2)O)O",
# "Masoprocol": "C1=CC(=C(C=C1C2=CC(=C(C=C2)O)O)O)O",
# "5-(3,3-Dihydroxypropeny)-3-Méthoxy-Benzène-1,2-Diol": "C1=CC(=C(C=C1C=CC(CO)O)O)O",
# "Didox": "C1=CC(=C(C=C1C2=CC(=C(C=C2)O)O)O)",
# "4-Nitrocatechol": "C1=CC(=C(C=C1[N+](=O)[O-])O)O"
# }
# Liste des molécules avec leurs SMILES
molecules = {
"Piceatannol" : "OC1=CC=C(C=C1)C=C(C2=CC(O)=CC(O)=C2)O"
}
# Stocker les résultats
results = []
for name, smiles in molecules.items():
mol = Chem.MolFromSmiles(smiles)
if mol is None:
print(f"❌ Erreur avec la molécule : {name} (SMILES invalide)")
continue
# Vérifier la présence de diméthoxybenzène, catéchol strict, catéchol flexible ou motif spécifique au Picéatannol
has_dimethoxybenzene = mol.HasSubstructMatch(dimethoxybenzene_pattern)
has_catechol_strict = mol.HasSubstructMatch(catechol_pattern)
has_catechol_flexible = mol.HasSubstructMatch(flexible_catechol_pattern)
has_piceatannol_specific = mol.HasSubstructMatch(piceatannol_specific_pattern)
print(f"