# y2clutch/ingest.py — initial commit (dae990d)
import argparse
import os
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.schema import Document
# ---- command-line configuration -------------------------------------------
# Parsed at import time so the module runs as a script; the parsed values are
# exposed as module-level constants for the entry point at the bottom.
parser = argparse.ArgumentParser()
parser.add_argument("folder", help="The folder to be ingested", type=str)
for flag, help_text, default in (
    ("--chunk_size", "Chunk size", 1500),
    ("--chunk_overlap", "Chunk overlap", 400),
):
    parser.add_argument(flag, help=help_text, type=int, default=default)
parser.add_argument("--separator", help="Separator", type=str, default="\n")
args = parser.parse_args()

FOLDER = args.folder
CHUNK_SIZE = args.chunk_size
CHUNK_OVERLAP = args.chunk_overlap
SEPARATOR = args.separator
class Ingest:
    """Ingest PDF and plain-text files from ./data/<folder> into a Chroma store.

    Each file is loaded, tagged with ``source`` / ``module`` metadata, split
    into overlapping character chunks, embedded with OpenAI embeddings, and
    persisted under ./chroma.
    """

    def __init__(
        self,
        folder,
        chunk_size,
        separator,
        chunk_overlap,
    ):
        """
        Args:
            folder: Sub-directory of ./data containing the source files.
            chunk_size: Maximum characters per chunk.
            separator: Preferred break string for the character splitter.
            chunk_overlap: Characters shared between consecutive chunks.
        """
        self.folder = folder
        self.chunk_size = chunk_size
        self.data_path = os.path.join('./data', self.folder)
        self.splitter = CharacterTextSplitter(
            separator=separator,
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            length_function=len,
        )

    def ingest(self):
        """Load all .pdf and .txt files in self.data_path, chunk, embed, and store them."""
        documents = []
        entries = os.listdir(self.data_path)
        # PDFs: one Document per page, tagged with the file's stem as source.
        for name in entries:
            if not name.endswith(".pdf"):
                continue
            docs = PyPDFLoader(os.path.join(self.data_path, name)).load()
            # splitext keeps everything before the final extension, so
            # "lecture.v2.pdf" -> "lecture.v2" (split(".")[0] would lose ".v2").
            stem = os.path.splitext(name)[0]
            for doc in docs:
                doc.metadata['source'] = stem
            documents.extend(docs)
        # Plain-text transcripts: the whole file becomes a single Document.
        for name in entries:
            if not name.endswith(".txt"):
                continue
            # Explicit encoding: default open() falls back to the platform
            # locale, which breaks on non-ASCII transcripts under Windows.
            with open(os.path.join(self.data_path, name), "r", encoding="utf-8") as fh:
                documents.append(Document(
                    page_content=fh.read(),
                    metadata={"source": os.path.splitext(name)[0] + ' transcript'},
                ))
        # Tag every document with the module (folder) it came from.
        for doc in documents:
            doc.metadata['module'] = self.folder
        # split texts into chunks
        print("Splitting texts into chunks...")
        chunks = self.splitter.split_documents(documents)
        embeddings = OpenAIEmbeddings()
        # create store
        print("Embedding chunks...")
        # NOTE(review): recent Chroma versions persist automatically when
        # persist_directory is given; older ones need an explicit .persist()
        # on the returned store — confirm against the pinned langchain version.
        Chroma.from_documents(chunks, embeddings, persist_directory='./chroma')
if __name__ == "__main__":
ingest = Ingest(
folder = FOLDER,
chunk_size = CHUNK_SIZE,
separator = SEPARATOR,
chunk_overlap = CHUNK_OVERLAP,
)
ingest.ingest()