import os
import tempfile
import time
from typing import List

from langchain.schema.document import Document
from langchain_community.document_loaders import PyMuPDFLoader


def process_pdf(content: str, file_name: str, db_type: str, root_dir: str, text_splitter, database) -> List[Document]:
    """Parse PDF content into split documents carrying uniform metadata.

    The content is written to a uniquely named temporary ``.pdf`` file inside
    ``root_dir``, loaded with ``PyMuPDFLoader``, split with ``text_splitter``
    and every resulting chunk gets its metadata replaced by a fixed schema.
    The temporary file is always removed before returning.

    Args:
        content: The PDF bytes carried as a string. It is encoded with
            latin-1, so every character must be a code point <= 255; no
            base64 decoding is performed here.
        file_name: Name stored under the ``title`` and ``source`` metadata keys.
        db_type: Value stored under the ``db_type`` metadata key.
        root_dir: Existing directory in which the temporary file is created.
        text_splitter: Object exposing ``split_documents(documents)``.
        database: Value stored under the ``database`` metadata key.

    Returns:
        The split documents, or an empty list if loading or splitting failed
        (the error is printed, not raised).

    Raises:
        UnicodeEncodeError: If ``content`` contains characters outside latin-1.
        OSError: If the temporary file cannot be created or written.
    """
    # Encode before touching the filesystem so that bad input leaves no
    # stray temporary file behind.
    pdf_bytes = content.encode("latin1")

    # mkstemp guarantees a unique name; a time.time()-based name can collide
    # when two calls happen close together.
    fd, temp_pdf_path = tempfile.mkstemp(prefix="temp_", suffix=".pdf", dir=root_dir)
    try:
        with os.fdopen(fd, "wb") as f:
            f.write(pdf_bytes)

        base_metadata = {
            "database": database,
            "db_type": db_type,
            "title": file_name,
            "type": "pdf",
            "parent_id": "null",
            "web_url": "null",
            "parent": "null",
            "source": file_name,
            "Summary": "null",
            "Subject": "null",
            "createdAt": "null",
            "tags": "null",
        }

        try:
            loader = PyMuPDFLoader(temp_pdf_path)
            documents = text_splitter.split_documents(loader.load())
            for doc in documents:
                # Each chunk gets its own dict so later mutation of one
                # document's metadata cannot leak into the others.
                doc.metadata = dict(base_metadata)
        except Exception as e:
            # Best-effort contract: report the failure and return no documents.
            print(e)
            documents = []
    finally:
        # Single cleanup point for success, parse failure and write failure.
        # A failed delete must not discard documents that were parsed fine.
        try:
            os.remove(temp_pdf_path)
        except OSError as e:
            print(e)

    return documents