ter = array_value($config, 'filter'); $arr = array_value($filter, $type); $enable = array_value($arr, 'enable'); $wordarr = array_value($arr, 'keyword'); if (0 == $enable || empty($wordarr)) return FALSE; foreach ($wordarr as $_keyword) { if (!$_keyword) continue; $r = strpos(strtolower($keyword), strtolower($_keyword)); if (FALSE !== $r) { $error = $_keyword; return TRUE; } } return FALSE; } // return http://domain.com OR https://domain.com function url_prefix() { $http = ((isset($_SERVER['HTTPS']) && 'on' == $_SERVER['HTTPS']) || (isset($_SERVER['HTTP_X_FORWARDED_PROTO']) && $_SERVER['HTTP_X_FORWARDED_PROTO'] == 'https')) ? 'https://' : 'http://'; return $http . $_SERVER['HTTP_HOST']; } // 唯一身份ID function uniq_id() { return uniqid(substr(md5(microtime(true) . mt_rand(1000, 9999)), 8, 8)); } // 生成订单号 14位 function trade_no() { $trade_no = str_replace('.', '', microtime(1)); $strlen = mb_strlen($trade_no, 'UTF-8'); $strlen = 14 - $strlen; $str = ''; if ($strlen) { for ($i = 0; $i <= $strlen; $i++) { if ($i < $strlen) $str .= '0'; } } return $trade_no . $str; } // 生成订单号 16位 function trade_no_16() { $explode = explode(' ', microtime()); $trade_no = $explode[1] . mb_substr($explode[0], 2, 6, 'UTF-8'); return $trade_no; } // 当前年的天数 function date_year($time = NULL) { $time = intval($time) ? $time : time(); return date('L', $time) + 365; } // 当前年份中的第几天 function date_z($time = NULL) { $time = intval($time) ? $time : time(); return date('z', $time); } // 当前月份中的第几天,没有前导零 1 到 31 function date_j($time = NULL) { $time = intval($time) ? $time : time(); return date('j', $time); } // 当前月份中的第几天,有前导零的2位数字 01 到 31 function date_d($time = NULL) { $time = intval($time) ? $time : time(); return date('d', $time); } // 当前时间为星期中的第几天 数字表示 1表示星期一 到 7表示星期天 function date_w_n($time = NULL) { $time = intval($time) ? $time : time(); return date('N', $time); } // 当前日第几周 function date_d_w($time = NULL) { $time = intval($time) ? $time : time(); return date('W', $time); } // 当前几月 没有前导零1-12 function date_n($time = NULL) { $time = intval($time) ? $time : time(); return date('n', $time); } // 当前月的天数 function date_t($time = NULL) { $time = intval($time) ? $time : time(); return date('t', $time); } // 0 o'clock on the day function clock_zero() { return strtotime(date('Ymd')); } // 24 o'clock on the day function clock_twenty_four() { return strtotime(date('Ymd')) + 86400; } // 8点过期 / expired at 8 a.m. function eight_expired($time = NULL) { $time = intval($time) ? $time : time(); // 当前时间大于8点则改为第二天8点过期 $life = date('G') <= 8 ? (strtotime(date('Ymd')) + 28800 - $time) : clock_twenty_four() - $time + 28800; return $life; } // 24点过期 / expired at 24 a.m. function twenty_four_expired($time = NULL) { $time = intval($time) ? $time : time(); $twenty_four = clock_twenty_four(); $life = $twenty_four - $time; return $life; } /** * @param $url 提交地址 * @param string $post POST数组 / 空为GET获取数据 / $post='GET'获取连续跳转最终URL * @param string $cookie cookie * @param int $timeout 超时 * @param int $ms 设为1是毫秒 * @return mixed 返回数据 */ function https_request($url, $post = '', $cookie = '', $timeout = 30, $ms = 0) { if (empty($url)) return FALSE; if (version_compare(PHP_VERSION, '5.2.3', '<')) { $ms = 0; $timeout = 30; } is_array($post) and $post = http_build_query($post); // 没有安装curl 使用http的形式,支持post if (!extension_loaded('curl')) { //throw new Exception('server not install CURL'); if ($post) { return https_post($url, $post, $cookie, $timeout); } else { return http_get($url, $cookie, $timeout); } } is_array($cookie) and $cookie = http_build_query($cookie); $curl = curl_init(); // 返回执行结果,不输出 curl_setopt($curl, CURLOPT_RETURNTRANSFER, true); //php5.5跟php5.6中的CURLOPT_SAFE_UPLOAD的默认值不同 if (class_exists('\CURLFile')) { curl_setopt($curl, CURLOPT_SAFE_UPLOAD, true); } else { defined('CURLOPT_SAFE_UPLOAD') and curl_setopt($curl, CURLOPT_SAFE_UPLOAD, false); } // 设定请求的RUL curl_setopt($curl, CURLOPT_URL, $url); // 设定返回信息中包含响应信息头 if (ini_get('safe_mode') && ini_get('open_basedir')) { // $post参数必须为GET if ('GET' == $post) { // 安全模式时将头文件的信息作为数据流输出 curl_setopt($curl, CURLOPT_HEADER, true); // 安全模式采用连续抓取 curl_setopt($curl, CURLOPT_NOBODY, true); } } else { curl_setopt($curl, CURLOPT_HEADER, false); // 允许跳转10次 curl_setopt($curl, CURLOPT_MAXREDIRS, 10); // 使用自动跳转,返回最后的Location curl_setopt($curl, CURLOPT_FOLLOWLOCATION, true); } $ua1 = 'Mozilla/5.0 (iPhone; CPU iPhone OS 13_2_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/13.0.3 Mobile/15E148 Safari/604.1'; $ua = empty($_SERVER["HTTP_USER_AGENT"]) ? $ua1 : $_SERVER["HTTP_USER_AGENT"]; curl_setopt($curl, CURLOPT_USERAGENT, $ua); // 兼容HTTPS if (FALSE !== stripos($url, 'https://')) { curl_setopt($curl, CURLOPT_SSL_VERIFYPEER, FALSE); curl_setopt($curl, CURLOPT_SSL_VERIFYHOST, FALSE); //ssl版本控制 //curl_setopt($curl, CURLOPT_SSLVERSION, CURL_SSLVERSION_TLSv1); curl_setopt($curl, CURLOPT_SSLVERSION, true); } $header = array('Content-type: application/x-www-form-urlencoded;charset=UTF-8', 'X-Requested-With: XMLHttpRequest'); $cookie and $header[] = "Cookie: $cookie"; curl_setopt($curl, CURLOPT_HTTPHEADER, $header); if ($post) { // POST curl_setopt($curl, CURLOPT_POST, true); // 自动设置Referer curl_setopt($curl, CURLOPT_AUTOREFERER, true); curl_setopt($curl, CURLOPT_POSTFIELDS, $post); } if ($ms) { curl_setopt($curl, CURLOPT_NOSIGNAL, true); // 设置毫秒超时 curl_setopt($curl, CURLOPT_TIMEOUT_MS, intval($timeout)); // 超时毫秒 } else { curl_setopt($curl, CURLOPT_TIMEOUT, intval($timeout)); // 秒超时 } //优先解析 IPv6 超时后IPv4 //curl_setopt($curl, CURLOPT_IPRESOLVE, CURL_IPRESOLVE_V4); curl_setopt($curl, CURLOPT_ENCODING, 'gzip'); // 返回执行结果 $output = curl_exec($curl); // 有效URL,输出URL非URL页面内容 CURLOPT_RETURNTRANSFER 必须为false 'GET' == $post and $output = curl_getinfo($curl, CURLINFO_EFFECTIVE_URL); curl_close($curl); return $output; } function save_image($img) { $ch = curl_init(); // 设定请求的RUL curl_setopt($ch, CURLOPT_URL, $img); // 设定返回信息中包含响应信息头 启用时会将头文件的信息作为数据流输出 //curl_setopt($ch, CURLOPT_HEADER, false); //curl_setopt($ch, CURLOPT_USERAGENT, $_SERVER["HTTP_USER_AGENT"]); // true表示$html,false表示echo $html curl_setopt($ch, CURLOPT_RETURNTRANSFER, true); curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10); curl_setopt($ch, CURLOPT_SSL_VERIFYHOST, false); curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false); //curl_setopt($ch, CURLOPT_BINARYTRANSFER, 1); //curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 0); curl_setopt($ch, CURLOPT_ENCODING, 'gzip'); $output = curl_exec($ch); curl_close($ch); return $output; } // 计算字串宽度:剧中对齐(字体大小/字串内容/字体链接/背景宽度/倍数) function calculate_str_width($size, $str, $font, $width, $multiple = 2) { $box = imagettfbbox($size, 0, $font, $str); return ($width - $box[4] - $box[6]) / $multiple; } // 搜索目录下的文件 比对文件后缀 function search_directory($path) { if (is_dir($path)) { $paths = scandir($path); foreach ($paths as $val) { $sub_path = $path . '/' . $val; if ('.' == $val || '..' == $val) { continue; } else if (is_dir($sub_path)) { //echo '目录名:' . $val . '
'; search_directory($sub_path); } else { //echo ' 最底层文件: ' . $path . '/' . $val . '
'; $ext = strtolower(file_ext($sub_path)); if (in_array($ext, array('php', 'asp', 'jsp', 'cgi', 'exe', 'dll'), TRUE)) { echo '异常文件:' . $sub_path . '
'; } } } } } // 一维数组转字符串 $sign待签名字符串 $url为urlencode转码GET参数字符串 function array_to_string($arr, &$sign = '', &$url = '') { if (count($arr) != count($arr, 1)) throw new Exception('Does not support multi-dimensional array to string'); // 注销签名 unset($arr['sign']); // 排序 ksort($arr); reset($arr); // 转字符串做签名 $url = ''; $sign = ''; foreach ($arr as $key => $val) { if (empty($val) || is_array($val)) continue; $url .= $key . '=' . urlencode($val) . '&'; $sign .= $key . '=' . $val . '&'; } $url = substr($url, 0, -1); $url = htmlspecialchars($url); $sign = substr($sign, 0, -1); } // 私钥生成签名 function rsa_create_sign($data, $key, $sign_type = 'RSA') { if (!function_exists('openssl_sign')) throw new Exception('OpenSSL extension is not enabled'); if (!defined('OPENSSL_ALGO_SHA256')) throw new Exception('Only versions above PHP 5.4.8 support SHA256'); $key = wordwrap($key, 64, "\n", true); if (FALSE === $key) throw new Exception('Private Key Error'); $key = "-----BEGIN RSA PRIVATE KEY-----\n$key\n-----END RSA PRIVATE KEY-----"; if ('RSA2' == $sign_type) { openssl_sign($data, $sign, $key, OPENSSL_ALGO_SHA256); } else { openssl_sign($data, $sign, $key, OPENSSL_ALGO_SHA1); } // 加密 return base64_encode($sign); } // 公钥验证签名 function rsa_verify_sign($data, $sign, $key, $sign_type = 'RSA') { $key = wordwrap($key, 64, "\n", true); if (FALSE === $key) throw new Exception('Public Key Error'); $key = "-----BEGIN PUBLIC KEY-----\n$key\n-----END PUBLIC KEY-----"; // 签名正确返回1 签名不正确返回0 错误-1 if ('RSA2' == $sign_type) { $result = openssl_verify($data, base64_decode($sign), $key, OPENSSL_ALGO_SHA256); } else { $result = openssl_verify($data, base64_decode($sign), $key, OPENSSL_ALGO_SHA1); } return $result === 1; } // Array to xml array('appid' => 'appid', 'code' => 'success') function array_to_xml($arr) { if (!is_array($arr) || empty($arr)) throw new Exception('Array Error'); $xml = ""; foreach ($arr as $key => $val) { if (is_numeric($val)) { $xml .= "<" . $key . ">" . $val . ""; } else { $xml .= "<" . $key . ">"; } } $xml .= ""; return $xml; } // Xml to array function xml_to_array($xml) { if (!$xml) throw new Exception('XML error'); $old = libxml_disable_entity_loader(true); // xml解析 $result = (array)simplexml_load_string($xml, null, LIBXML_NOCDATA | LIBXML_COMPACT); // 恢复旧值 if (FALSE === $old) libxml_disable_entity_loader(false); return $result; } // 逐行读取 function well_import($file) { if ($handle = fopen($file, 'r')) { while (!feof($handle)) { yield trim(fgets($handle)); } fclose($handle); } } // 计算总行数 function well_import_total($file, $key = 'well_import_total') { static $cache = array(); if (isset($cache[$key])) return $cache[$key]; $count = cache_get($key); if (NULL === $count) { $count = 0; $globs = well_import($file); while ($globs->valid()) { ++$count; $globs->next(); // 指向下一个 } $count and cache_set($key, $count, 300); } return $cache[$key] = $count; } $g_dir_file = FALSE; function well_search_dir($path) { global $g_dir_file; FALSE === $g_dir_file and $g_dir_file = array(); if (is_dir($path)) { $paths = scandir($path); foreach ($paths as $val) { $sub_path = $path . '/' . $val; if ('.' == $val || '..' == $val) { continue; } else if (is_dir($sub_path)) { well_search_dir($sub_path); } else { $g_dir_file[] = $sub_path; } } } return $g_dir_file; } ?>vba - Open PDF&#39;s as Word docs and add sections as cells in excel - Stack Overflow
最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

vba - Open PDF&#39;s as Word docs and add sections as cells in excel - Stack Overflow

programmeradmin0浏览0评论

I have a requirement that takes place on a frequent enough basis and takes days to complete. I have an excel document that needs to be updated with data from PDF files. They are just basic text with no major formatting other than a toc and Headings. The sections I want to pull out Have Heading 1 that match the first and second column in the Excel file, Then they are in bold arial 12 pt: font followed by a colon in the pdf/word doc. That is the only place in the document that exists. I need everything between that until the next Heading 1 / bold Arail 12 pt: So, everything between that goes in the matching row column in excel.

It will basically look Like this:

    Stuff - page 1
    TOC
    Header page
    First page
    Heading 1 - (AB) First Heading
    
    First section in bold arial 12: 
    
    Stuff between
    usually a numbered or bulleted list
    
     1. List item
     2. List item2
    
    Next Heading1 (CD) Second heading
    
    second section in bold arial 12: 
    
    Stuff between
    usually a numbered or bulleted list
    

 -  List item
 -  List item2

then the next column as so forth until the next match to heading 1 in the PDF/word doc. The names of the worksheets match the first 2 letters of or worksheet, which, in turn match the 2 letters between the () in the name of the file. I'm looping through all the *.pdf files in a dir chosen from a bff dialog. Here is my code so far:

Sub ExtractFromPDF()
    Dim folderPath As String
    Dim fileName As String
    Dim wdApp As Object, wdDoc As Object
    Dim xlWS As Worksheet
    Dim header As String, extractedText As String
    Dim cell As Range
    Dim sectionA As String, sectionB As String
    Dim sheet As Worksheet
    Dim fileCode As String
    Dim missingPDFs As String

    missingPDFs = ""
    Debug.Print "Starting ExtractFromPDF routine."

    ' Browse for folder path containing PDFs
    folderPath = BrowseForFolder("Select PDF Directory", "C:\temp\pdffiles")
    If folderPath = "" Then Exit Sub
    Debug.Print "Selected folder: " & folderPath

    ' Initialize Word Application
    On Error Resume Next
    Set wdApp = GetObject(, "Word.Application")
    If wdApp Is Nothing Then Set wdApp = CreateObject("Word.Application")
    On Error GoTo ErrorHandler

    If wdApp Is Nothing Then
        MsgBox "Microsoft Word is not installed or accessible.", vbCritical
        Exit Sub
    End If

    ' Allow breaking out of the loop
    Application.EnableCancelKey = xlErrorHandler

    ' Loop through PDF files
    fileName = Dir(folderPath & "\*.pdf")

    Do While fileName <> ""
        Debug.Print "Processing file: " & fileName
        Set xlWS = Nothing

        ' Extract the two letters within parentheses from the file name
        fileCode = ""
        If InStr(fileName, "(") > 0 And InStr(fileName, ")") > 0 Then
            fileCode = Mid(fileName, InStr(fileName, "(") + 1, 2)
            Debug.Print "Extracted file code: " & fileCode
        Else
            missingPDFs = missingPDFs & "Invalid filename format: " & fileName & vbCrLf
            Debug.Print "Invalid filename format: " & fileName
            GoTo NextFile
        End If

        ' Find the matching worksheet by first two letters
        For Each sheet In ThisWorkbook.Sheets
            If UCase(Left(sheet.Name, 2)) = UCase(fileCode) Then
                Set xlWS = sheet
                Debug.Print "Matched worksheet: " & sheet.Name
                Exit For
            End If
        Next sheet

        If xlWS Is Nothing Then
            missingPDFs = missingPDFs & "No matching worksheet for: " & fileName & vbCrLf
            Debug.Print "No matching worksheet for: " & fileName
            GoTo NextFile
        End If

        ' Open PDF with Word
        Set wdDoc = wdApp.Documents.Open(folderPath & "\" & fileName, ReadOnly:=True)

        ' Ensure Word document opened successfully
        If wdDoc Is Nothing Then
            missingPDFs = missingPDFs & "Failed to open in Word: " & fileName & vbCrLf
            Debug.Print "Failed to open Word document: " & fileName
            GoTo NextFile
        End If

        ' Process Word document content
        extractedText = ""

        Dim para As Object
        For Each para In wdDoc.Paragraphs
            Dim rng As Object
            Set rng = para.Range

            ' Check for Bold Arial 12 followed by colon
            If rng.Font.Bold = True And rng.Font.Name = "Arial" And rng.Font.Size = 12 Then
                If InStr(rng.Text, ":") > 0 Then
                    header = Trim(Split(rng.Text, ":")(0))
                    extractedText = Trim(Mid(rng.Text, InStr(rng.Text, ":") + 1))
                    Debug.Print "Extracted header: " & header
                    Debug.Print "Extracted text: " & extractedText

                    ' Parse section A and B from header
                    If InStr(header, " ") > 0 Then
                        sectionA = Trim(Split(header, " ")(0))
                        sectionB = Trim(Mid(header, InStr(header, " ") + 1))
                    Else
                        sectionA = header
                        sectionB = ""
                    End If

                    ' Match section A + space + B in Excel
                    Dim rowNum As Long
                    rowNum = 1
                    Do While xlWS.Cells(rowNum, 1).Value <> ""
                        Dim combinedAB As String
                        combinedAB = Trim(xlWS.Cells(rowNum, 1).Value) & " " & Trim(xlWS.Cells(rowNum, 1).Value)

                        ' Print current content being matched
                        Debug.Print "Matching PDF: " & extractedText & " with Excel: " & combinedAB

                        If NormalizeText(combinedAB) = NormalizeText(sectionA & " " & sectionB) Then
                            ' Compare and display if different (ignoring punctuation, spaces, and line breaks)
                            If NormalizeText(xlWS.Cells(rowNum, 3).Value) <> NormalizeText(extractedText) Then
                                Debug.Print "Discrepancy in " & xlWS.Cells(rowNum, 3).Address & " (Sheet: " & xlWS.Name & "): " & extractedText
                            End If
                            Exit Do
                        End If
                        rowNum = rowNum + 1
                    Loop
                End If
            End If
        Next para

        ' Close Word Document
        wdDoc.Close False

NextFile:
        ' Next PDF
        fileName = Dir
    Loop

    ' Report missing PDFs
    If missingPDFs <> "" Then
        MsgBox "Issues encountered: " & vbCrLf & missingPDFs, vbExclamation
    End If

    ' Cleanup
    If Not wdApp Is Nothing Then wdApp.Quit
    Set wdApp = Nothing
    MsgBox "Processing Complete!"
    Debug.Print "Processing complete."

ExitSub:
    If Not wdApp Is Nothing Then wdApp.Quit
    Set wdApp = Nothing
    MsgBox "Process Interrupted!"
    Debug.Print "Process interrupted."
    Exit Sub

ErrorHandler:
    Debug.Print "Error encountered: " & Err.Number & " - " & Err.Description
    If Err.Number = 18 Then Resume ExitSub
    MsgBox "Error: " & Err.Description
    Resume Next
End Sub

Function NormalizeText(ByVal txt As String) As String
    Dim regex As Object
    Set regex = CreateObject("VBScript.RegExp")

    regex.Pattern = "[^a-zA-Z0-9()\-]"
    regex.Global = True
    txt = regex.Replace(txt, "")

    NormalizeText = txt
End Function


Function BrowseForFolder(prompt As String, Optional defaultPath As String = "") As String
    Dim shellApp As Object
    Set shellApp = CreateObject("Shell.Application")
    Dim folder As Object

    ' Debug: Check if the defaultPath is passed and if it exists
    Debug.Print "Initial defaultPath: " & defaultPath
    
    ' Check if the defaultPath is valid and exists
    If defaultPath <> "" Then
        ' Ensure the path exists
        If Dir(defaultPath, vbDirectory) = "" Then
            MsgBox "The specified default path does not exist: " & defaultPath, vbExclamation
            defaultPath = "" ' Reset to empty if invalid path
        Else
            ' Change drive and directory if path exists
            On Error Resume Next
            ChDrive Left(defaultPath, 1)
            ChDir defaultPath
            On Error GoTo 0
        End If
    End If

    ' Show folder browse dialog
    Set folder = shellApp.BrowseForFolder(0, prompt, 0)

    ' If a folder is selected, return the path
    If Not folder Is Nothing Then
        BrowseForFolder = folder.Self.Path
        Debug.Print "Folder selected: " & BrowseForFolder
    Else
        ' If no folder is selected, return an empty string
        BrowseForFolder = ""
        Debug.Print "No folder selected."
    End If
End Function


Sub ClearImmediateWindow()
    On Error GoTo ErrorHandler

    ' Attempt to reset the Immediate window
    Application.VBE.CommandBars("Immediate").Reset
    Exit Sub

ErrorHandler:
    ' Handle specific error here
    MsgBox "Error occurred: " & Err.Description, vbCritical
End Sub

The problem I'm having is, the extractedpath and header variables don't appear to be getting set. the debug.prints don't ever show in the immediate window. I just get

Matching PDF: 
 with Excel: Column1value Column2value

over and over again going through all the Columns buy not matching to the section in the word document.

Does anyone see anything I'm missing or a better way to do this? I know it's a bit complicated, so if any clarifications need to be made, feel free to ask in comments.

I have a requirement that takes place on a frequent enough basis and takes days to complete. I have an excel document that needs to be updated with data from PDF files. They are just basic text with no major formatting other than a toc and Headings. The sections I want to pull out Have Heading 1 that match the first and second column in the Excel file, Then they are in bold arial 12 pt: font followed by a colon in the pdf/word doc. That is the only place in the document that exists. I need everything between that until the next Heading 1 / bold Arail 12 pt: So, everything between that goes in the matching row column in excel.

It will basically look Like this:

    Stuff - page 1
    TOC
    Header page
    First page
    Heading 1 - (AB) First Heading
    
    First section in bold arial 12: 
    
    Stuff between
    usually a numbered or bulleted list
    
     1. List item
     2. List item2
    
    Next Heading1 (CD) Second heading
    
    second section in bold arial 12: 
    
    Stuff between
    usually a numbered or bulleted list
    

 -  List item
 -  List item2

then the next column as so forth until the next match to heading 1 in the PDF/word doc. The names of the worksheets match the first 2 letters of or worksheet, which, in turn match the 2 letters between the () in the name of the file. I'm looping through all the *.pdf files in a dir chosen from a bff dialog. Here is my code so far:

Sub ExtractFromPDF()
    Dim folderPath As String
    Dim fileName As String
    Dim wdApp As Object, wdDoc As Object
    Dim xlWS As Worksheet
    Dim header As String, extractedText As String
    Dim cell As Range
    Dim sectionA As String, sectionB As String
    Dim sheet As Worksheet
    Dim fileCode As String
    Dim missingPDFs As String

    missingPDFs = ""
    Debug.Print "Starting ExtractFromPDF routine."

    ' Browse for folder path containing PDFs
    folderPath = BrowseForFolder("Select PDF Directory", "C:\temp\pdffiles")
    If folderPath = "" Then Exit Sub
    Debug.Print "Selected folder: " & folderPath

    ' Initialize Word Application
    On Error Resume Next
    Set wdApp = GetObject(, "Word.Application")
    If wdApp Is Nothing Then Set wdApp = CreateObject("Word.Application")
    On Error GoTo ErrorHandler

    If wdApp Is Nothing Then
        MsgBox "Microsoft Word is not installed or accessible.", vbCritical
        Exit Sub
    End If

    ' Allow breaking out of the loop
    Application.EnableCancelKey = xlErrorHandler

    ' Loop through PDF files
    fileName = Dir(folderPath & "\*.pdf")

    Do While fileName <> ""
        Debug.Print "Processing file: " & fileName
        Set xlWS = Nothing

        ' Extract the two letters within parentheses from the file name
        fileCode = ""
        If InStr(fileName, "(") > 0 And InStr(fileName, ")") > 0 Then
            fileCode = Mid(fileName, InStr(fileName, "(") + 1, 2)
            Debug.Print "Extracted file code: " & fileCode
        Else
            missingPDFs = missingPDFs & "Invalid filename format: " & fileName & vbCrLf
            Debug.Print "Invalid filename format: " & fileName
            GoTo NextFile
        End If

        ' Find the matching worksheet by first two letters
        For Each sheet In ThisWorkbook.Sheets
            If UCase(Left(sheet.Name, 2)) = UCase(fileCode) Then
                Set xlWS = sheet
                Debug.Print "Matched worksheet: " & sheet.Name
                Exit For
            End If
        Next sheet

        If xlWS Is Nothing Then
            missingPDFs = missingPDFs & "No matching worksheet for: " & fileName & vbCrLf
            Debug.Print "No matching worksheet for: " & fileName
            GoTo NextFile
        End If

        ' Open PDF with Word
        Set wdDoc = wdApp.Documents.Open(folderPath & "\" & fileName, ReadOnly:=True)

        ' Ensure Word document opened successfully
        If wdDoc Is Nothing Then
            missingPDFs = missingPDFs & "Failed to open in Word: " & fileName & vbCrLf
            Debug.Print "Failed to open Word document: " & fileName
            GoTo NextFile
        End If

        ' Process Word document content
        extractedText = ""

        Dim para As Object
        For Each para In wdDoc.Paragraphs
            Dim rng As Object
            Set rng = para.Range

            ' Check for Bold Arial 12 followed by colon
            If rng.Font.Bold = True And rng.Font.Name = "Arial" And rng.Font.Size = 12 Then
                If InStr(rng.Text, ":") > 0 Then
                    header = Trim(Split(rng.Text, ":")(0))
                    extractedText = Trim(Mid(rng.Text, InStr(rng.Text, ":") + 1))
                    Debug.Print "Extracted header: " & header
                    Debug.Print "Extracted text: " & extractedText

                    ' Parse section A and B from header
                    If InStr(header, " ") > 0 Then
                        sectionA = Trim(Split(header, " ")(0))
                        sectionB = Trim(Mid(header, InStr(header, " ") + 1))
                    Else
                        sectionA = header
                        sectionB = ""
                    End If

                    ' Match section A + space + B in Excel
                    Dim rowNum As Long
                    rowNum = 1
                    Do While xlWS.Cells(rowNum, 1).Value <> ""
                        Dim combinedAB As String
                        combinedAB = Trim(xlWS.Cells(rowNum, 1).Value) & " " & Trim(xlWS.Cells(rowNum, 1).Value)

                        ' Print current content being matched
                        Debug.Print "Matching PDF: " & extractedText & " with Excel: " & combinedAB

                        If NormalizeText(combinedAB) = NormalizeText(sectionA & " " & sectionB) Then
                            ' Compare and display if different (ignoring punctuation, spaces, and line breaks)
                            If NormalizeText(xlWS.Cells(rowNum, 3).Value) <> NormalizeText(extractedText) Then
                                Debug.Print "Discrepancy in " & xlWS.Cells(rowNum, 3).Address & " (Sheet: " & xlWS.Name & "): " & extractedText
                            End If
                            Exit Do
                        End If
                        rowNum = rowNum + 1
                    Loop
                End If
            End If
        Next para

        ' Close Word Document
        wdDoc.Close False

NextFile:
        ' Next PDF
        fileName = Dir
    Loop

    ' Report missing PDFs
    If missingPDFs <> "" Then
        MsgBox "Issues encountered: " & vbCrLf & missingPDFs, vbExclamation
    End If

    ' Cleanup
    If Not wdApp Is Nothing Then wdApp.Quit
    Set wdApp = Nothing
    MsgBox "Processing Complete!"
    Debug.Print "Processing complete."

ExitSub:
    If Not wdApp Is Nothing Then wdApp.Quit
    Set wdApp = Nothing
    MsgBox "Process Interrupted!"
    Debug.Print "Process interrupted."
    Exit Sub

ErrorHandler:
    Debug.Print "Error encountered: " & Err.Number & " - " & Err.Description
    If Err.Number = 18 Then Resume ExitSub
    MsgBox "Error: " & Err.Description
    Resume Next
End Sub

Function NormalizeText(ByVal txt As String) As String
    Dim regex As Object
    Set regex = CreateObject("VBScript.RegExp")

    regex.Pattern = "[^a-zA-Z0-9()\-]"
    regex.Global = True
    txt = regex.Replace(txt, "")

    NormalizeText = txt
End Function


Function BrowseForFolder(prompt As String, Optional defaultPath As String = "") As String
    Dim shellApp As Object
    Set shellApp = CreateObject("Shell.Application")
    Dim folder As Object

    ' Debug: Check if the defaultPath is passed and if it exists
    Debug.Print "Initial defaultPath: " & defaultPath
    
    ' Check if the defaultPath is valid and exists
    If defaultPath <> "" Then
        ' Ensure the path exists
        If Dir(defaultPath, vbDirectory) = "" Then
            MsgBox "The specified default path does not exist: " & defaultPath, vbExclamation
            defaultPath = "" ' Reset to empty if invalid path
        Else
            ' Change drive and directory if path exists
            On Error Resume Next
            ChDrive Left(defaultPath, 1)
            ChDir defaultPath
            On Error GoTo 0
        End If
    End If

    ' Show folder browse dialog
    Set folder = shellApp.BrowseForFolder(0, prompt, 0)

    ' If a folder is selected, return the path
    If Not folder Is Nothing Then
        BrowseForFolder = folder.Self.Path
        Debug.Print "Folder selected: " & BrowseForFolder
    Else
        ' If no folder is selected, return an empty string
        BrowseForFolder = ""
        Debug.Print "No folder selected."
    End If
End Function


Sub ClearImmediateWindow()
    On Error GoTo ErrorHandler

    ' Attempt to reset the Immediate window
    Application.VBE.CommandBars("Immediate").Reset
    Exit Sub

ErrorHandler:
    ' Handle specific error here
    MsgBox "Error occurred: " & Err.Description, vbCritical
End Sub

The problem I'm having is, the extractedpath and header variables don't appear to be getting set. the debug.prints don't ever show in the immediate window. I just get

Matching PDF: 
 with Excel: Column1value Column2value

over and over again going through all the Columns buy not matching to the section in the word document.

Does anyone see anything I'm missing or a better way to do this? I know it's a bit complicated, so if any clarifications need to be made, feel free to ask in comments.

Share Improve this question edited Mar 28 at 16:25 Tim Williams 167k8 gold badges100 silver badges141 bronze badges asked Mar 28 at 15:27 Matt WilliamsonMatt Williamson 7,1191 gold badge25 silver badges38 bronze badges 4
  • Would be good if you could limit the code you show to only the relevant parts - no one needs to see how you select the folder, how you open word, what you do with "invalid files" or the code of the error handler. Makes it really hard to find the right piece of code. Anyhow, as no one has the PDFs you have, I have no clue how we could help. Have you debugged the code? Does it find any range with Arial 12 at all? – FunThomas Commented Mar 28 at 16:22
  • You need to step through your code using the debugger and see where the execution goes. extractedText = Trim(Mid(rng.Text, InStr(rng.Text, ":") + 1)) The paragraph ends with ":" though, so what text are you trying to get here? You're really interested in the text after the header paragraph. It would be useful if you could share an example Word doc (suitably redacted) for folks to test with. – Tim Williams Commented Mar 28 at 16:38
  • 1 Instead of crawling through the paragraphs use the Find function to go straight to the heading. Search for the style Heading 1 as you have stated that is what is used. You can then use a predefined bookmark to return all the text for that heading. – Timothy Rylatt Commented Mar 28 at 17:04
  • @K J Thanks, that helps to sort things a bit. – Matt Williamson Commented Mar 30 at 18:46
Add a comment  | 

1 Answer 1

Reset to default 1

If the converted content really does include content in the Heading1 Style, all you need is some simple Word VBA code like:

Sub GetHeadingSpanText()
Dim RngHd As Range, strOut As String
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Style = wdStyleHeading1
    .Replacement.Text = ""
    .Wrap = wdFindStop
    .Forward = True
    .Format = True
    .Text = ""
  End With
  Do While .Find.Execute
    Set RngHd = .Paragraphs(1).Range
    Set RngHd = RngHd.GoTo(What:=wdGoToBookmark, Name:="\HeadingLevel")
    RngHd.Start = RngHd.Paragraphs.First.Range.End: strOut = RngHd.Text: MsgBox strOut
    .Collapse wdCollapseEnd
  Loop
End With
Set RngHd = Nothing
End Sub

If your 12pt Arial Bold content isn't actually a Heading Style, all you need is:

Sub GetHeadingSpanText()
Dim RngHd As Range, strOut As String
With ActiveDocument.Range
  With .Find
    .ClearFormatting
    .Replacement.ClearFormatting
    .Replacement.Text = ""
    .Text = ""
    .Wrap = wdFindContinue
    .Forward = True
    .Format = True
    With .Font
      .Name = "Arial"
      .Size = 12
      .Bold = 12
    End With
    .Replacement.Style = wdStyleHeading1
    .Execute Replace:=wdReplaceAll
    .ClearFormatting
    .Replacement.ClearFormatting
    .Style = wdStyleHeading1
    .Wrap = wdFindStop
  End With
  Do While .Find.Execute
    Set RngHd = .Paragraphs(1).Range
    Set RngHd = RngHd.GoTo(What:=wdGoToBookmark, Name:="\HeadingLevel")
    RngHd.Start = RngHd.Paragraphs.First.Range.End: strOut = RngHd.Text: MsgBox strOut
    .Collapse wdCollapseEnd
  Loop
End With
Set RngHd = Nothing
End Sub

I'll leave you to incorporate the Word code into your existing process.

发布评论

评论列表(0)

  1. 暂无评论