I am trying to do as the topic says. According to some sources, it should be possible to extract comments in CSV format, but I only have FDF and XFDF formats available, and I've tried using both Acrobat Reader and PDFXchange Editor.
After creating an FDF file (I tried an XFDF file as well, but that file seems to only contain some decrypted interpretation of the text I'm after so my scripts can't find anything), I have tried to write both a Python script and a VBA macro to extract the data I need from the FDF file, but for some reason neither script manages to correctly paste the data into the Excel sheet even though they both manage to find it. I used print() and Msg Box to verify.
I have also tried importing the comments from the PDF file from Excel, using power queries, but it doesn't find comments specifically and any data it finds isn't structured anywhere close to presentable.
Below are first my VBA macro, and secondly my Python script:
VBA:
Sub ImportCommentsFromFDF()
Dim fdfFilePath As String
Dim excelFilePath As String
Dim fdfFileNum As Integer
Dim line As String
Dim commentator As String
Dim comment As String
Dim commentDate As String
Dim rowNum As Long
Dim ws As Worksheet
Dim wb As Workbook
Dim commenters As Collection
Dim comments As Collection
Dim commentDates As Collection
fdfFilePath = ThisWorkbook.Path & "\comments.fdf"
excelFilePath = ThisWorkbook.Path & "\Comments.xlsx"
If Dir(fdfFilePath) = "" Then
MsgBox "Error: The file '" & fdfFilePath & "' does not exist."
Exit Sub
End If
If Dir(excelFilePath) = "" Then
MsgBox "Error: The file '" & excelFilePath & "' does not exist. A new file will be created."
Exit Sub
End If
Set wb = Workbooks.Open(excelFilePath)
Set ws = wb.Sheets("Sheet1")
Set commenters = New Collection
Set comments = New Collection
Set commentDates = New Collection
fdfFileNum = FreeFile
Open fdfFilePath For Input As fdfFileNum
Do While Not EOF(fdfFileNum)
Line Input #fdfFileNum, line
commentator = ""
comment = ""
commentDate = ""
If InStr(line, "/T (") > 0 Then
commentator = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
Debug.Print "Found Commentator: " & commentator
End If
If InStr(line, "/Contents (") > 0 Then
comment = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
Debug.Print "Found Comment: " & comment
End If
If InStr(line, "/CreationDate (") > 0 Then
commentDate = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
Debug.Print "Found Comment Date: " & commentDate
End If
If commentator <> "" And comment <> "" And commentDate <> "" Then
commenters.Add commentator
comments.Add comment
commentDates.Add commentDate
End If
Loop
Close fdfFileNum
If commenters.Count = comments.Count And comments.Count = commentDates.Count Then
rowNum = 2
For i = 1 To commenters.Count
ws.Cells(rowNum, 1).Value = commenters(i)
ws.Cells(rowNum, 2).Value = comments(i)
ws.Cells(rowNum, 3).Value = commentDates(i)
rowNum = rowNum + 1
Next i
wb.Save
MsgBox "Comments imported successfully!"
Else
MsgBox "Warning: The lists are not of the same length." & vbCrLf & _
"Commenters: " & commenters.Count & vbCrLf & _
"Comments: " & comments.Count & vbCrLf & _
"Dates: " & commentDates.Count
End If
wb.Close SaveChanges:=True
End Sub
Python:
import pandas as pd
import os
def import_comments_from_fdf(fdf_file_path, excel_file_path):
if not os.path.isfile(fdf_file_path):
print(f"Error: The file '{fdf_file_path}' does not exist.")
return
if not os.path.isfile(excel_file_path):
print(f"Error: The file '{excel_file_path}' does not exist. A new file will be created.")
commenters = []
comments = []
comment_dates = []
with open(fdf_file_path, 'r', encoding='ISO-8859-1') as fdf_file:
for line in fdf_file:
commentator = ""
comment = ""
comment_date = ""
if '/T (' in line:
commentator = line[line.index('(') + 1:line.index(')')].strip()
print(f"Found Commentator: {commentator}")
if '/Contents (' in line:
comment = line[line.index('(') + 1:line.index(')')].strip()
print(f"Found Comment: {comment}")
if '/CreationDate (' in line:
comment_date = line[line.index('(') + 1:line.index(')')].strip()
print(f"Found Comment Date: {comment_date}")
if commentator and comment and comment_date:
commenters.append(commentator)
comments.append(comment)
comment_dates.append(comment_date)
if len(commenters) == len(comments) == len(comment_dates):
data = {
'Commenter': commenters,
'Comment': comments,
'Date Created': comment_dates
}
df = pd.DataFrame(data)
print("DataFrame to be written:")
print(df)
with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
for index, row in df.iterrows():
writer.sheets['Sheet1'].cell(row=index + 2, column=1, value=row['Commenter'])
writer.sheets['Sheet1'].cell(row=index + 2, column=2, value=row['Comment'])
writer.sheets['Sheet1'].cell(row=index + 2, column=3, value=row['Date Created'])
print("Comments imported successfully!")
else:
print("Warning: The lists are not of the same length.")
print(f"Commenters: {len(commenters)}, Comments: {len(comments)}, Dates: {len(comment_dates)}")
fdf_file_path = 'comments.fdf'
excel_file_path = 'Comments.xlsx'
import_comments_from_fdf(fdf_file_path, excel_file_path)