最新消息:雨落星辰是一个专注网站SEO优化、网站SEO诊断、搜索引擎研究、网络营销推广、网站策划运营及站长类的自媒体原创博客

How to extract comments from a PDF and put them into an Excel sheet - Stack Overflow

programmeradmin0浏览0评论

I am trying to do as the topic says. According to some sources, it should be possible to extract comments in CSV format, but I only have FDF and XFDF formats available, and I've tried using both Acrobat Reader and PDFXchange Editor.

After creating an FDF file (I tried an XFDF file as well, but that file seems to only contain some decrypted interpretation of the text I'm after so my scripts can't find anything), I have tried to write both a Python script and a VBA macro to extract the data I need from the FDF file, but for some reason neither script manages to correctly paste the data into the Excel sheet even though they both manage to find it. I used print() and Msg Box to verify.

I have also tried importing the comments from the PDF file from Excel, using power queries, but it doesn't find comments specifically and any data it finds isn't structured anywhere close to presentable.

Below are first my VBA macro, and secondly my Python script:

VBA:

Sub ImportCommentsFromFDF()
    Dim fdfFilePath As String
    Dim excelFilePath As String
    Dim fdfFileNum As Integer
    Dim line As String
    Dim commentator As String
    Dim comment As String
    Dim commentDate As String
    Dim rowNum As Long
    Dim ws As Worksheet
    Dim wb As Workbook
    Dim commenters As Collection
    Dim comments As Collection
    Dim commentDates As Collection

    fdfFilePath = ThisWorkbook.Path & "\comments.fdf"
    excelFilePath = ThisWorkbook.Path & "\Comments.xlsx"
    
    If Dir(fdfFilePath) = "" Then
        MsgBox "Error: The file '" & fdfFilePath & "' does not exist."
        Exit Sub
    End If

    If Dir(excelFilePath) = "" Then
        MsgBox "Error: The file '" & excelFilePath & "' does not exist. A new file will be created."
        Exit Sub
    End If

    Set wb = Workbooks.Open(excelFilePath)
    Set ws = wb.Sheets("Sheet1")

    Set commenters = New Collection
    Set comments = New Collection
    Set commentDates = New Collection

    fdfFileNum = FreeFile
    Open fdfFilePath For Input As fdfFileNum
    
    Do While Not EOF(fdfFileNum)
        Line Input #fdfFileNum, line
        
        commentator = ""
        comment = ""
        commentDate = ""

        If InStr(line, "/T (") > 0 Then
            commentator = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
            Debug.Print "Found Commentator: " & commentator
        End If
        
        If InStr(line, "/Contents (") > 0 Then
            comment = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
            Debug.Print "Found Comment: " & comment
        End If
        

        If InStr(line, "/CreationDate (") > 0 Then
            commentDate = Trim(Mid(line, InStr(line, "(") + 1, InStr(line, ")") - InStr(line, "(") - 1))
            Debug.Print "Found Comment Date: " & commentDate
        End If
        
        If commentator <> "" And comment <> "" And commentDate <> "" Then
            commenters.Add commentator
            comments.Add comment
            commentDates.Add commentDate
        End If
    Loop
    
    Close fdfFileNum
    
    If commenters.Count = comments.Count And comments.Count = commentDates.Count Then
        rowNum = 2
        For i = 1 To commenters.Count
            ws.Cells(rowNum, 1).Value = commenters(i)
            ws.Cells(rowNum, 2).Value = comments(i)
            ws.Cells(rowNum, 3).Value = commentDates(i)
            rowNum = rowNum + 1
        Next i
        
        wb.Save
        MsgBox "Comments imported successfully!"
    Else
        MsgBox "Warning: The lists are not of the same length." & vbCrLf & _
               "Commenters: " & commenters.Count & vbCrLf & _
               "Comments: " & comments.Count & vbCrLf & _
               "Dates: " & commentDates.Count
    End If

    wb.Close SaveChanges:=True
End Sub

Python:

import pandas as pd
import os

def import_comments_from_fdf(fdf_file_path, excel_file_path):
    if not os.path.isfile(fdf_file_path):
        print(f"Error: The file '{fdf_file_path}' does not exist.")
        return

    if not os.path.isfile(excel_file_path):
        print(f"Error: The file '{excel_file_path}' does not exist. A new file will be created.")

    commenters = []
    comments = []
    comment_dates = []

    with open(fdf_file_path, 'r', encoding='ISO-8859-1') as fdf_file:
        for line in fdf_file:
            commentator = ""
            comment = ""
            comment_date = ""

            if '/T (' in line:
                commentator = line[line.index('(') + 1:line.index(')')].strip()
                print(f"Found Commentator: {commentator}")

            if '/Contents (' in line:
                comment = line[line.index('(') + 1:line.index(')')].strip()
                print(f"Found Comment: {comment}")

            if '/CreationDate (' in line:
                comment_date = line[line.index('(') + 1:line.index(')')].strip()
                print(f"Found Comment Date: {comment_date}")

            if commentator and comment and comment_date:
                commenters.append(commentator)
                comments.append(comment)
                comment_dates.append(comment_date)

    if len(commenters) == len(comments) == len(comment_dates):
        data = {
            'Commenter': commenters,
            'Comment': comments,
            'Date Created': comment_dates
        }
        df = pd.DataFrame(data)

        print("DataFrame to be written:")
        print(df)

        with pd.ExcelWriter(excel_file_path, engine='openpyxl', mode='a', if_sheet_exists='overlay') as writer:
            for index, row in df.iterrows():
                writer.sheets['Sheet1'].cell(row=index + 2, column=1, value=row['Commenter'])
                writer.sheets['Sheet1'].cell(row=index + 2, column=2, value=row['Comment'])
                writer.sheets['Sheet1'].cell(row=index + 2, column=3, value=row['Date Created'])

        print("Comments imported successfully!")
    else:
        print("Warning: The lists are not of the same length.")
        print(f"Commenters: {len(commenters)}, Comments: {len(comments)}, Dates: {len(comment_dates)}")

fdf_file_path = 'comments.fdf'
excel_file_path = 'Comments.xlsx'

import_comments_from_fdf(fdf_file_path, excel_file_path)

发布评论

评论列表(0)

  1. 暂无评论