
python - Vendor and products details extract from Gmail - Stack Overflow


I am trying to extract vendor and product details from emails into an Excel sheet, including fields like **Mail_Date, Mail_Subject, Product_Name, Product_Quantity, Product_Price, Vendor_Name, Vendor_Email, Vendor_Phone, Vendor_Address, Vendor_GST_No, and Vendor_Website**.

However, I'm facing issues with the regular expressions. When the data has a consistent structure I get partial output, but when it is unstructured the results are completely off, with random values like "aaa", "676776", or "0000" appearing in the fields.
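
To make the mismatch concrete, here is a minimal check against the same table pattern I use in the full script below (both sample lines are made up):

import re

# Same "Pattern 1" as in the full script below; both sample lines are invented for illustration.
table_pattern = r'(\d+)\s+(.+?)\s+(\d+)\s+([\d,]+)\s+([\d,]+)'

structured = "1 HP LaserJet Toner 5 2,500 12,500"
unstructured = "Please send 5 nos of HP LaserJet Toner, around 2500 each"

print(re.findall(table_pattern, structured))    # [('1', 'HP LaserJet Toner', '5', '2,500', '12,500')]
print(re.findall(table_pattern, unstructured))  # [] -- nothing matches, so the output row ends up empty or filled with junk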

How can I resolve this issue?

I have also tried an ML-based approach with spaCy, but it also yields incorrect output (there is a quick check of which entity labels the model actually emits after the full script below). Any suggestions?

import re
import yaml
import imaplib
import spacy
import pandas as pd
from email import message_from_bytes
from bs4 import BeautifulSoup
from dateutil import parser

nlp = spacy.load("en_core_web_sm")

class ProcurementEmailParser:
    def __init__(self, credentials_path):
        self.credentials = self.load_credentials(credentials_path)
        self.mail = None
        # Vendor_Address and Vendor_GST_No are not included yet
        self.output_columns = [
            'Mail_Date', 'Mail_Subject', 'Product_Name',
            'Product_Quantity', 'Product_Price',
            'Vendor_Name', 'Vendor_Email', 'Vendor_Phone', 'Vendor_Website'
        ]
    
    def load_credentials(self, path):
        with open(path) as f:
            return yaml.safe_load(f)
    
    def connect_email(self):
        self.mail = imaplib.IMAP4_SSL('imap.gmail.com')
        self.mail.login(self.credentials['email'],
                        self.credentials['password'])
        self.mail.select('inbox')
    
    def extract_vendor_info(self, text):
        vendor_info = {
            'Vendor_Name': '',
            'Vendor_Email': '',
            'Vendor_Phone': '',
            'Vendor_Website': ''
        }
        
        # Extract email
        email_match = re.search(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b', text)
        if email_match:
            vendor_info['Vendor_Email'] = email_match.group(0)
        
        # Extract phone
        phone_match = re.search(r'(?:\+?91[\s-]?)?[6-9]\d{9}', text)
        if phone_match:
            vendor_info['Vendor_Phone'] = phone_match.group(0).replace(' ', '')

        # Extract website
        website_match = re.search(r'(?:https?://)?(?:www\.)?[\w.-]+\.[a-zA-Z]{2,}', text)
        if website_match:
            vendor_info['Vendor_Website'] = website_match.group(0)
        
        # Extract vendor name from signature
        if 'Regards,' in text:
            signature_block = text.split('Regards,')[-1].strip()
            lines = [line.strip() for line in signature_block.split('\n') if line.strip()]
            if lines:
                vendor_info['Vendor_Name'] = re.sub(r'[^a-zA-Z\s]', '', lines[0]).strip()
        
        return vendor_info
    
    def safe_float_conversion(self, value):
        try:
            return float(value.replace(',', '')) if value else None
        except (ValueError, AttributeError):
            return None
    
    def extract_products(self, text):
        products = []
        
        # Pattern 1: Table format with price
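        # (expects five whitespace-separated columns: a leading number, the name, the quantity,
        #  then two numeric columns; only groups 2, 3 and 5 feed the output record)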
        table_pattern = r'(\d+)\s+(.+?)\s+(\d+)\s+([\d,]+)\s+([\d,]+)'
        for match in re.finditer(table_pattern, text):
            products.append({
                'Product_Name': match.group(2).strip(),
                'Product_Quantity': int(match.group(3)),
                'Product_Price': self.safe_float_conversion(match.group(5))})
        
        # Pattern 2: Line items with optional price
        line_pattern = r'(.+?)\s+[-–]\s+(\d+)\s*(?:nos|units|qty)\s*[-–]?\s*([₹$]?[\d,]+)?'
        for match in re.finditer(line_pattern, text, re.IGNORECASE):
            product_name = match.group(1).strip()
            quantity = int(match.group(2))
            price = self.safe_float_conversion(match.group(3)) if match.group(3) else None
            products.append({
                'Product_Name': product_name,
                'Product_Quantity': quantity,
                'Product_Price': price
            })
        
        # Pattern 3: NLP-based extraction
        if not products:
            doc = nlp(text)
            current_product = {'name': '', 'qty': None, 'price': None}
            for ent in doc.ents:
                if ent.label_ == 'PRODUCT':
                    current_product['name'] = ent.text
                elif ent.label_ == 'QUANTITY' and 'nos' in ent.text.lower():
                    current_product['qty'] = int(re.search(r'\d+', ent.text).group())
                elif ent.label_ == 'MONEY':
                    current_product['price'] = self.safe_float_conversion(ent.text)
                
                if current_product['name'] and current_product['qty']:
                    products.append({
                        'Product_Name': current_product['name'],
                        'Product_Quantity': current_product['qty'],
                        'Product_Price': current_product['price']
                    })
                    current_product = {'name': '', 'qty': None, 'price': None}
        
        return products
    
    def process_email(self, email_msg):
        try:
            # Extract email content
            text_content = ''
            for part in email_msg.walk():
                if part.get_content_type() == 'text/plain':
                    text_content += part.get_payload(decode=True).decode('utf-8', 'ignore')
                elif part.get_content_type() == 'text/html':
                    html_content = part.get_payload(decode=True).decode('utf-8', 'ignore')
                    soup = BeautifulSoup(html_content, 'html.parser')
                    text_content += '\n' + soup.get_text(separator=' ', strip=True)
            
            # Clean text
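            # (note: collapsing all whitespace also removes newlines, so the '\n' split
            #  in extract_vendor_info below only ever sees a single line)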
            text_content = re.sub(r'\s+', ' ', text_content).strip()
            
            # Extract vendor info
            vendor_info = self.extract_vendor_info(text_content)
            
            # Extract products
            products = self.extract_products(text_content)
            
            # Create records
            records = []
            for product in products:
                record = {
                    'Mail_Date': parser.parse(email_msg['Date']).strftime('%Y-%m-%d %H:%M:%S'),
                    'Mail_Subject': email_msg.get('Subject', 'No Subject'),
                    **product,
                    **vendor_info
                }
                records.append(record)

            return records
        
        except Exception as e:
            print(f"Error processing email: {str(e)}")
            return []
    
    def process_emails(self, limit=50, save_path='output.xlsx'):
        self.connect_email()
        _, msg_ids = self.mail.search(None, 'ALL')
        all_data = []
        
        for msg_id in msg_ids[0].split()[-limit:]:
            try:
                _, msg_data = self.mail.fetch(msg_id, '(RFC822)')
                email_msg = message_from_bytes(msg_data[0][1])
                all_data.extend(self.process_email(email_msg))
            except Exception as e:
                print(f"Error processing email {msg_id.decode()}: {str(e)}")
                continue
        
        # Save to Excel
        df = pd.DataFrame(all_data, columns=self.output_columns)
        df.to_excel(save_path, index=False)
        return df

if __name__ == "__main__":
    # use a name that does not shadow the "parser" imported from dateutil (process_email relies on parser.parse)
    email_parser = ProcurementEmailParser("C:\\Users\\one\\credentials.yml")
    df = email_parser.process_emails(limit=50, save_path="C:\\Users\\one\\vp_details.xlsx")
    print(f"Successfully processed {len(df)} records")


   