I have a requirement that takes place on a frequent enough basis and takes days to complete. I have an excel document that needs to be updated with data from PDF files. They are just basic text with no major formatting other than a toc and Headings. The sections I want to pull out Have Heading 1 that match the first and second column in the Excel file, Then they are in bold arial 12 pt: font followed by a colon in the pdf/word doc. That is the only place in the document that exists. I need everything between that until the next Heading 1 / bold Arail 12 pt: So, everything between that goes in the matching row column in excel.
It will basically look Like this:
Stuff - page 1
TOC
Header page
First page
Heading 1 - (AB) First Heading
First section in bold arial 12:
Stuff between
usually a numbered or bulleted list
1. List item
2. List item2
Next Heading1 (CD) Second heading
second section in bold arial 12:
Stuff between
usually a numbered or bulleted list
- List item
- List item2
then the next column as so forth until the next match to heading 1 in the PDF/word doc. The names of the worksheets match the first 2 letters of or worksheet, which, in turn match the 2 letters between the () in the name of the file. I'm looping through all the *.pdf files in a dir chosen from a bff dialog. Here is my code so far:
Sub ExtractFromPDF()
Dim folderPath As String
Dim fileName As String
Dim wdApp As Object, wdDoc As Object
Dim xlWS As Worksheet
Dim header As String, extractedText As String
Dim cell As Range
Dim sectionA As String, sectionB As String
Dim sheet As Worksheet
Dim fileCode As String
Dim missingPDFs As String
missingPDFs = ""
Debug.Print "Starting ExtractFromPDF routine."
' Browse for folder path containing PDFs
folderPath = BrowseForFolder("Select PDF Directory", "C:\temp\pdffiles")
If folderPath = "" Then Exit Sub
Debug.Print "Selected folder: " & folderPath
' Initialize Word Application
On Error Resume Next
Set wdApp = GetObject(, "Word.Application")
If wdApp Is Nothing Then Set wdApp = CreateObject("Word.Application")
On Error GoTo ErrorHandler
If wdApp Is Nothing Then
MsgBox "Microsoft Word is not installed or accessible.", vbCritical
Exit Sub
End If
' Allow breaking out of the loop
Application.EnableCancelKey = xlErrorHandler
' Loop through PDF files
fileName = Dir(folderPath & "\*.pdf")
Do While fileName <> ""
Debug.Print "Processing file: " & fileName
Set xlWS = Nothing
' Extract the two letters within parentheses from the file name
fileCode = ""
If InStr(fileName, "(") > 0 And InStr(fileName, ")") > 0 Then
fileCode = Mid(fileName, InStr(fileName, "(") + 1, 2)
Debug.Print "Extracted file code: " & fileCode
Else
missingPDFs = missingPDFs & "Invalid filename format: " & fileName & vbCrLf
Debug.Print "Invalid filename format: " & fileName
GoTo NextFile
End If
' Find the matching worksheet by first two letters
For Each sheet In ThisWorkbook.Sheets
If UCase(Left(sheet.Name, 2)) = UCase(fileCode) Then
Set xlWS = sheet
Debug.Print "Matched worksheet: " & sheet.Name
Exit For
End If
Next sheet
If xlWS Is Nothing Then
missingPDFs = missingPDFs & "No matching worksheet for: " & fileName & vbCrLf
Debug.Print "No matching worksheet for: " & fileName
GoTo NextFile
End If
' Open PDF with Word
Set wdDoc = wdApp.Documents.Open(folderPath & "\" & fileName, ReadOnly:=True)
' Ensure Word document opened successfully
If wdDoc Is Nothing Then
missingPDFs = missingPDFs & "Failed to open in Word: " & fileName & vbCrLf
Debug.Print "Failed to open Word document: " & fileName
GoTo NextFile
End If
' Process Word document content
extractedText = ""
Dim para As Object
For Each para In wdDoc.Paragraphs
Dim rng As Object
Set rng = para.Range
' Check for Bold Arial 12 followed by colon
If rng.Font.Bold = True And rng.Font.Name = "Arial" And rng.Font.Size = 12 Then
If InStr(rng.Text, ":") > 0 Then
header = Trim(Split(rng.Text, ":")(0))
extractedText = Trim(Mid(rng.Text, InStr(rng.Text, ":") + 1))
Debug.Print "Extracted header: " & header
Debug.Print "Extracted text: " & extractedText
' Parse section A and B from header
If InStr(header, " ") > 0 Then
sectionA = Trim(Split(header, " ")(0))
sectionB = Trim(Mid(header, InStr(header, " ") + 1))
Else
sectionA = header
sectionB = ""
End If
' Match section A + space + B in Excel
Dim rowNum As Long
rowNum = 1
Do While xlWS.Cells(rowNum, 1).Value <> ""
Dim combinedAB As String
combinedAB = Trim(xlWS.Cells(rowNum, 1).Value) & " " & Trim(xlWS.Cells(rowNum, 1).Value)
' Print current content being matched
Debug.Print "Matching PDF: " & extractedText & " with Excel: " & combinedAB
If NormalizeText(combinedAB) = NormalizeText(sectionA & " " & sectionB) Then
' Compare and display if different (ignoring punctuation, spaces, and line breaks)
If NormalizeText(xlWS.Cells(rowNum, 3).Value) <> NormalizeText(extractedText) Then
Debug.Print "Discrepancy in " & xlWS.Cells(rowNum, 3).Address & " (Sheet: " & xlWS.Name & "): " & extractedText
End If
Exit Do
End If
rowNum = rowNum + 1
Loop
End If
End If
Next para
' Close Word Document
wdDoc.Close False
NextFile:
' Next PDF
fileName = Dir
Loop
' Report missing PDFs
If missingPDFs <> "" Then
MsgBox "Issues encountered: " & vbCrLf & missingPDFs, vbExclamation
End If
' Cleanup
If Not wdApp Is Nothing Then wdApp.Quit
Set wdApp = Nothing
MsgBox "Processing Complete!"
Debug.Print "Processing complete."
ExitSub:
If Not wdApp Is Nothing Then wdApp.Quit
Set wdApp = Nothing
MsgBox "Process Interrupted!"
Debug.Print "Process interrupted."
Exit Sub
ErrorHandler:
Debug.Print "Error encountered: " & Err.Number & " - " & Err.Description
If Err.Number = 18 Then Resume ExitSub
MsgBox "Error: " & Err.Description
Resume Next
End Sub
Function NormalizeText(ByVal txt As String) As String
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
regex.Pattern = "[^a-zA-Z0-9()\-]"
regex.Global = True
txt = regex.Replace(txt, "")
NormalizeText = txt
End Function
Function BrowseForFolder(prompt As String, Optional defaultPath As String = "") As String
Dim shellApp As Object
Set shellApp = CreateObject("Shell.Application")
Dim folder As Object
' Debug: Check if the defaultPath is passed and if it exists
Debug.Print "Initial defaultPath: " & defaultPath
' Check if the defaultPath is valid and exists
If defaultPath <> "" Then
' Ensure the path exists
If Dir(defaultPath, vbDirectory) = "" Then
MsgBox "The specified default path does not exist: " & defaultPath, vbExclamation
defaultPath = "" ' Reset to empty if invalid path
Else
' Change drive and directory if path exists
On Error Resume Next
ChDrive Left(defaultPath, 1)
ChDir defaultPath
On Error GoTo 0
End If
End If
' Show folder browse dialog
Set folder = shellApp.BrowseForFolder(0, prompt, 0)
' If a folder is selected, return the path
If Not folder Is Nothing Then
BrowseForFolder = folder.Self.Path
Debug.Print "Folder selected: " & BrowseForFolder
Else
' If no folder is selected, return an empty string
BrowseForFolder = ""
Debug.Print "No folder selected."
End If
End Function
Sub ClearImmediateWindow()
On Error GoTo ErrorHandler
' Attempt to reset the Immediate window
Application.VBE.CommandBars("Immediate").Reset
Exit Sub
ErrorHandler:
' Handle specific error here
MsgBox "Error occurred: " & Err.Description, vbCritical
End Sub
The problem I'm having is, the extractedpath and header variables don't appear to be getting set. the debug.prints don't ever show in the immediate window. I just get
Matching PDF:
with Excel: Column1value Column2value
over and over again going through all the Columns buy not matching to the section in the word document.
Does anyone see anything I'm missing or a better way to do this? I know it's a bit complicated, so if any clarifications need to be made, feel free to ask in comments.
I have a requirement that takes place on a frequent enough basis and takes days to complete. I have an excel document that needs to be updated with data from PDF files. They are just basic text with no major formatting other than a toc and Headings. The sections I want to pull out Have Heading 1 that match the first and second column in the Excel file, Then they are in bold arial 12 pt: font followed by a colon in the pdf/word doc. That is the only place in the document that exists. I need everything between that until the next Heading 1 / bold Arail 12 pt: So, everything between that goes in the matching row column in excel.
It will basically look Like this:
Stuff - page 1
TOC
Header page
First page
Heading 1 - (AB) First Heading
First section in bold arial 12:
Stuff between
usually a numbered or bulleted list
1. List item
2. List item2
Next Heading1 (CD) Second heading
second section in bold arial 12:
Stuff between
usually a numbered or bulleted list
- List item
- List item2
then the next column as so forth until the next match to heading 1 in the PDF/word doc. The names of the worksheets match the first 2 letters of or worksheet, which, in turn match the 2 letters between the () in the name of the file. I'm looping through all the *.pdf files in a dir chosen from a bff dialog. Here is my code so far:
Sub ExtractFromPDF()
Dim folderPath As String
Dim fileName As String
Dim wdApp As Object, wdDoc As Object
Dim xlWS As Worksheet
Dim header As String, extractedText As String
Dim cell As Range
Dim sectionA As String, sectionB As String
Dim sheet As Worksheet
Dim fileCode As String
Dim missingPDFs As String
missingPDFs = ""
Debug.Print "Starting ExtractFromPDF routine."
' Browse for folder path containing PDFs
folderPath = BrowseForFolder("Select PDF Directory", "C:\temp\pdffiles")
If folderPath = "" Then Exit Sub
Debug.Print "Selected folder: " & folderPath
' Initialize Word Application
On Error Resume Next
Set wdApp = GetObject(, "Word.Application")
If wdApp Is Nothing Then Set wdApp = CreateObject("Word.Application")
On Error GoTo ErrorHandler
If wdApp Is Nothing Then
MsgBox "Microsoft Word is not installed or accessible.", vbCritical
Exit Sub
End If
' Allow breaking out of the loop
Application.EnableCancelKey = xlErrorHandler
' Loop through PDF files
fileName = Dir(folderPath & "\*.pdf")
Do While fileName <> ""
Debug.Print "Processing file: " & fileName
Set xlWS = Nothing
' Extract the two letters within parentheses from the file name
fileCode = ""
If InStr(fileName, "(") > 0 And InStr(fileName, ")") > 0 Then
fileCode = Mid(fileName, InStr(fileName, "(") + 1, 2)
Debug.Print "Extracted file code: " & fileCode
Else
missingPDFs = missingPDFs & "Invalid filename format: " & fileName & vbCrLf
Debug.Print "Invalid filename format: " & fileName
GoTo NextFile
End If
' Find the matching worksheet by first two letters
For Each sheet In ThisWorkbook.Sheets
If UCase(Left(sheet.Name, 2)) = UCase(fileCode) Then
Set xlWS = sheet
Debug.Print "Matched worksheet: " & sheet.Name
Exit For
End If
Next sheet
If xlWS Is Nothing Then
missingPDFs = missingPDFs & "No matching worksheet for: " & fileName & vbCrLf
Debug.Print "No matching worksheet for: " & fileName
GoTo NextFile
End If
' Open PDF with Word
Set wdDoc = wdApp.Documents.Open(folderPath & "\" & fileName, ReadOnly:=True)
' Ensure Word document opened successfully
If wdDoc Is Nothing Then
missingPDFs = missingPDFs & "Failed to open in Word: " & fileName & vbCrLf
Debug.Print "Failed to open Word document: " & fileName
GoTo NextFile
End If
' Process Word document content
extractedText = ""
Dim para As Object
For Each para In wdDoc.Paragraphs
Dim rng As Object
Set rng = para.Range
' Check for Bold Arial 12 followed by colon
If rng.Font.Bold = True And rng.Font.Name = "Arial" And rng.Font.Size = 12 Then
If InStr(rng.Text, ":") > 0 Then
header = Trim(Split(rng.Text, ":")(0))
extractedText = Trim(Mid(rng.Text, InStr(rng.Text, ":") + 1))
Debug.Print "Extracted header: " & header
Debug.Print "Extracted text: " & extractedText
' Parse section A and B from header
If InStr(header, " ") > 0 Then
sectionA = Trim(Split(header, " ")(0))
sectionB = Trim(Mid(header, InStr(header, " ") + 1))
Else
sectionA = header
sectionB = ""
End If
' Match section A + space + B in Excel
Dim rowNum As Long
rowNum = 1
Do While xlWS.Cells(rowNum, 1).Value <> ""
Dim combinedAB As String
combinedAB = Trim(xlWS.Cells(rowNum, 1).Value) & " " & Trim(xlWS.Cells(rowNum, 1).Value)
' Print current content being matched
Debug.Print "Matching PDF: " & extractedText & " with Excel: " & combinedAB
If NormalizeText(combinedAB) = NormalizeText(sectionA & " " & sectionB) Then
' Compare and display if different (ignoring punctuation, spaces, and line breaks)
If NormalizeText(xlWS.Cells(rowNum, 3).Value) <> NormalizeText(extractedText) Then
Debug.Print "Discrepancy in " & xlWS.Cells(rowNum, 3).Address & " (Sheet: " & xlWS.Name & "): " & extractedText
End If
Exit Do
End If
rowNum = rowNum + 1
Loop
End If
End If
Next para
' Close Word Document
wdDoc.Close False
NextFile:
' Next PDF
fileName = Dir
Loop
' Report missing PDFs
If missingPDFs <> "" Then
MsgBox "Issues encountered: " & vbCrLf & missingPDFs, vbExclamation
End If
' Cleanup
If Not wdApp Is Nothing Then wdApp.Quit
Set wdApp = Nothing
MsgBox "Processing Complete!"
Debug.Print "Processing complete."
ExitSub:
If Not wdApp Is Nothing Then wdApp.Quit
Set wdApp = Nothing
MsgBox "Process Interrupted!"
Debug.Print "Process interrupted."
Exit Sub
ErrorHandler:
Debug.Print "Error encountered: " & Err.Number & " - " & Err.Description
If Err.Number = 18 Then Resume ExitSub
MsgBox "Error: " & Err.Description
Resume Next
End Sub
Function NormalizeText(ByVal txt As String) As String
Dim regex As Object
Set regex = CreateObject("VBScript.RegExp")
regex.Pattern = "[^a-zA-Z0-9()\-]"
regex.Global = True
txt = regex.Replace(txt, "")
NormalizeText = txt
End Function
Function BrowseForFolder(prompt As String, Optional defaultPath As String = "") As String
Dim shellApp As Object
Set shellApp = CreateObject("Shell.Application")
Dim folder As Object
' Debug: Check if the defaultPath is passed and if it exists
Debug.Print "Initial defaultPath: " & defaultPath
' Check if the defaultPath is valid and exists
If defaultPath <> "" Then
' Ensure the path exists
If Dir(defaultPath, vbDirectory) = "" Then
MsgBox "The specified default path does not exist: " & defaultPath, vbExclamation
defaultPath = "" ' Reset to empty if invalid path
Else
' Change drive and directory if path exists
On Error Resume Next
ChDrive Left(defaultPath, 1)
ChDir defaultPath
On Error GoTo 0
End If
End If
' Show folder browse dialog
Set folder = shellApp.BrowseForFolder(0, prompt, 0)
' If a folder is selected, return the path
If Not folder Is Nothing Then
BrowseForFolder = folder.Self.Path
Debug.Print "Folder selected: " & BrowseForFolder
Else
' If no folder is selected, return an empty string
BrowseForFolder = ""
Debug.Print "No folder selected."
End If
End Function
Sub ClearImmediateWindow()
On Error GoTo ErrorHandler
' Attempt to reset the Immediate window
Application.VBE.CommandBars("Immediate").Reset
Exit Sub
ErrorHandler:
' Handle specific error here
MsgBox "Error occurred: " & Err.Description, vbCritical
End Sub
The problem I'm having is, the extractedpath and header variables don't appear to be getting set. the debug.prints don't ever show in the immediate window. I just get
Matching PDF:
with Excel: Column1value Column2value
over and over again going through all the Columns buy not matching to the section in the word document.
Does anyone see anything I'm missing or a better way to do this? I know it's a bit complicated, so if any clarifications need to be made, feel free to ask in comments.
Share Improve this question edited Mar 28 at 16:25 Tim Williams 167k8 gold badges100 silver badges141 bronze badges asked Mar 28 at 15:27 Matt WilliamsonMatt Williamson 7,1191 gold badge25 silver badges38 bronze badges 4 |1 Answer
Reset to default 1If the converted content really does include content in the Heading1 Style, all you need is some simple Word VBA code like:
Sub GetHeadingSpanText()
Dim RngHd As Range, strOut As String
With ActiveDocument.Range
With .Find
.ClearFormatting
.Replacement.ClearFormatting
.Style = wdStyleHeading1
.Replacement.Text = ""
.Wrap = wdFindStop
.Forward = True
.Format = True
.Text = ""
End With
Do While .Find.Execute
Set RngHd = .Paragraphs(1).Range
Set RngHd = RngHd.GoTo(What:=wdGoToBookmark, Name:="\HeadingLevel")
RngHd.Start = RngHd.Paragraphs.First.Range.End: strOut = RngHd.Text: MsgBox strOut
.Collapse wdCollapseEnd
Loop
End With
Set RngHd = Nothing
End Sub
If your 12pt Arial Bold content isn't actually a Heading Style, all you need is:
Sub GetHeadingSpanText()
Dim RngHd As Range, strOut As String
With ActiveDocument.Range
With .Find
.ClearFormatting
.Replacement.ClearFormatting
.Replacement.Text = ""
.Text = ""
.Wrap = wdFindContinue
.Forward = True
.Format = True
With .Font
.Name = "Arial"
.Size = 12
.Bold = 12
End With
.Replacement.Style = wdStyleHeading1
.Execute Replace:=wdReplaceAll
.ClearFormatting
.Replacement.ClearFormatting
.Style = wdStyleHeading1
.Wrap = wdFindStop
End With
Do While .Find.Execute
Set RngHd = .Paragraphs(1).Range
Set RngHd = RngHd.GoTo(What:=wdGoToBookmark, Name:="\HeadingLevel")
RngHd.Start = RngHd.Paragraphs.First.Range.End: strOut = RngHd.Text: MsgBox strOut
.Collapse wdCollapseEnd
Loop
End With
Set RngHd = Nothing
End Sub
I'll leave you to incorporate the Word code into your existing process.
extractedText = Trim(Mid(rng.Text, InStr(rng.Text, ":") + 1))
The paragraph ends with ":" though, so what text are you trying to get here? You're really interested in the text after the header paragraph. It would be useful if you could share an example Word doc (suitably redacted) for folks to test with. – Tim Williams Commented Mar 28 at 16:38Find
function to go straight to the heading. Search for the style Heading 1 as you have stated that is what is used. You can then use a predefined bookmark to return all the text for that heading. – Timothy Rylatt Commented Mar 28 at 17:04