"""Ingest the PDF and text files of one data folder into a Chroma store.

Usage: pass the name of a sub-folder of ``./data`` as the positional
``folder`` argument; ``--chunk_size``, ``--chunk_overlap`` and
``--separator`` tune how the documents are split before embedding.
"""
import argparse
import os

from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document


class Ingest:
    """Load, split and embed every ``.pdf`` and ``.txt`` file of a folder.

    Args:
        folder: Name of the sub-folder of ``./data`` to ingest. It is also
            stored on every document as the ``module`` metadata value.
        chunk_size: Chunk size handed to ``CharacterTextSplitter``.
        separator: Separator handed to ``CharacterTextSplitter``.
        chunk_overlap: Chunk overlap handed to ``CharacterTextSplitter``.
    """

    def __init__(
        self,
        folder,
        chunk_size,
        separator,
        chunk_overlap,
    ):
        self.folder = folder
        self.chunk_size = chunk_size
        self.data_path = os.path.join('./data', self.folder)
        self.splitter = CharacterTextSplitter(
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def _files_with_suffix(self, suffix):
        """Return the paths in the data folder whose name ends with *suffix*.

        The listing is sorted because ``os.listdir`` returns entries in
        arbitrary order, which would make the ingestion order vary per run.
        """
        return [
            os.path.join(self.data_path, name)
            for name in sorted(os.listdir(self.data_path))
            if name.endswith(suffix)
        ]

    @staticmethod
    def _stem(path):
        """Return the file name of *path* without its final extension.

        ``splitext`` strips only the last extension, so a name such as
        ``lecture.1.2.pdf`` keeps its inner dots instead of collapsing to
        ``lecture``.
        """
        return os.path.splitext(os.path.basename(path))[0]

    def ingest(self):
        """Build the documents, split them into chunks and store them.

        PDF files are read with ``PyPDFLoader``; text files are read whole
        and their source name gets a `` transcript`` suffix. Every document
        is tagged with ``module`` = the folder name before splitting. The
        chunks are passed to ``Chroma.from_documents`` with
        ``persist_directory='./chroma'``.

        Returns early, without embedding anything, when the folder contains
        no ``.pdf`` or ``.txt`` file.
        """
        documents = []

        # pdfs
        for pdf_path in self._files_with_suffix(".pdf"):
            pdf_docs = PyPDFLoader(pdf_path).load()
            source = self._stem(pdf_path)
            for doc in pdf_docs:
                doc.metadata['source'] = source
            documents.extend(pdf_docs)

        # txts
        for txt_path in self._files_with_suffix(".txt"):
            # Explicit encoding: the platform default differs between systems.
            with open(txt_path, "r", encoding="utf-8") as handle:
                documents.append(
                    Document(
                        page_content=handle.read(),
                        metadata={"source": self._stem(txt_path) + ' transcript'},
                    )
                )

        if not documents:
            print(f"No .pdf or .txt files found in {self.data_path}; nothing to ingest.")
            return

        for doc in documents:
            doc.metadata['module'] = self.folder

        # split texts into chunks
        print("Splitting texts into chunks...")
        chunks = self.splitter.split_documents(documents)

        # NOTE(review): presumably needs OpenAI credentials configured in the
        # environment -- confirm against the deployment setup.
        embeddings = OpenAIEmbeddings()

        # create store
        print("Embedding chunks...")
        Chroma.from_documents(chunks, embeddings, persist_directory='./chroma')


def _parse_args(argv=None):
    """Parse the command-line arguments (``sys.argv[1:]`` when *argv* is None)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("folder", help="The folder to be ingested", type=str)
    parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
    parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
    parser.add_argument('--separator', help='Separator', type=str, default='\n')
    return parser.parse_args(argv)


def main(argv=None):
    """Run one ingestion using the options given on the command line."""
    args = _parse_args(argv)
    ingest = Ingest(
        folder=args.folder,
        chunk_size=args.chunk_size,
        separator=args.separator,
        chunk_overlap=args.chunk_overlap,
    )
    ingest.ingest()


# Arguments are parsed only when run as a script, so importing this module
# no longer consumes sys.argv or exits on missing arguments.
if __name__ == "__main__":
    main()