I am writing a RAG chatbot that retrieves information from a given list of documents. The documents live in a set folder and can be either .pdf or .docx. I want to merge all of them into the same vector store, but I am running into trouble with MergedDataLoader because any given file could be either a .docx or a .pdf. Does anyone have a recommendation for solving this efficiently?
import os

from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader

# root_directory and batch_size are defined earlier in my script.

# Initialize an empty list to store loaded documents
docs = []

# Function to process a batch of PDF/DOCX files
def process_pdf_batch(all_files):
    batch_docs = []
    for any_file_path in all_files:
        # Implementation using one loader or the other, picked by extension
        if any_file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(any_file_path)
        elif any_file_path.lower().endswith(".docx"):
            loader = Docx2txtLoader(any_file_path)
        else:
            continue  # Skip anything that is neither PDF nor DOCX
        batch_docs.extend(loader.load())

        # Implementation trying to combine both loaders
        # pdf_loader = PyPDFLoader(any_file_path)
        # doc_loader = Docx2txtLoader(any_file_path)
        # all_loader = MergedDataLoader(loaders=[doc_loader, pdf_loader])
        # batch_docs.extend(all_loader.load())
        # pdf_loader = Docx2txtLoader(pdf_file_path)
        # batch_docs.extend(pdf_loader.load())
    return batch_docs

# Get the list of PDF and DOCX files to process
pdf_files_to_process = []
for root, dirs, files in os.walk(root_directory):
    pdf_files_to_process.extend(
        os.path.join(root, file)
        for file in files
        if file.lower().endswith((".pdf", ".docx"))
    )

total_files = len(pdf_files_to_process)
processed_files = 0

# Iterate through the files in batches
for i in range(0, total_files, batch_size):
    batch = pdf_files_to_process[i:i + batch_size]
    batch_docs = process_pdf_batch(batch)
    docs.extend(batch_docs)
    processed_files += len(batch)
    print(f"Processed {processed_files} / {total_files} files")
I have tried two implementations: one that picks the appropriate loader for each file based on its extension, and another that combines both loaders into a single MergedDataLoader (the commented-out code above).
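To make the first approach concrete, here is a minimal sketch of what I mean by picking a loader per file, written as a lookup table instead of the if/elif chain above. LOADER_BY_EXT and load_any_file are just names I made up for this sketch, and I'm assuming both loaders come from langchain_community.document_loaders; the idea is that each file only ever needs one loader, so nothing has to run both loaders against the same path.

import os
from langchain_community.document_loaders import Docx2txtLoader, PyPDFLoader

# Map each supported extension to its loader class (illustrative names).
LOADER_BY_EXT = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
}

def load_any_file(path):
    # Choose the loader class from the file extension and return its documents.
    ext = os.path.splitext(path)[1].lower()
    loader_cls = LOADER_BY_EXT.get(ext)
    if loader_cls is None:
        return []  # Unsupported file type: nothing to load
    return loader_cls(path).load()

# Usage: build one flat list of documents that a single vector store can index.
# docs = []
# for path in pdf_files_to_process:
#     docs.extend(load_any_file(path))

Is something like this the recommended pattern, or is there a more direct way to feed mixed file types into one vector store?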