|
import argparse |
|
|
|
import os |
|
|
|
from langchain.document_loaders import PyPDFLoader |
|
from langchain.text_splitter import CharacterTextSplitter, LatexTextSplitter |
|
from langchain.embeddings.openai import OpenAIEmbeddings |
|
from langchain.vectorstores import Chroma |
|
from langchain.schema import Document |
|
|
|
|
|
|
|
|
|
def _str_to_bool(value):
    """Convert a command-line string into a real boolean.

    ``type=bool`` must not be used with argparse: ``bool("False")`` is
    ``True`` because every non-empty string is truthy, so
    ``--use_tex_splitter False`` would silently enable the option.

    Args:
        value: The raw string taken from the command line (a ``bool`` is
            passed through unchanged).

    Returns:
        ``True`` or ``False`` according to the spelled-out value.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised
            spelling of true/false.
    """
    if isinstance(value, bool):
        return value
    normalized = value.strip().lower()
    if normalized in ("true", "t", "yes", "y", "1", "on"):
        return True
    # The empty string stays falsy, exactly as it was under ``type=bool``.
    if normalized in ("false", "f", "no", "n", "0", "off", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


# Command-line interface: one positional folder name plus chunking options.
parser = argparse.ArgumentParser()
parser.add_argument("folder", help="The folder to be ingested", type=str)
parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
parser.add_argument('--separator', help='Separator', type=str, default='\n')
parser.add_argument('--use_tex_splitter', help='Use tex splitter', type=_str_to_bool, default=False)
|
|
|
# Parse the command line once, at module load, and publish every option as a
# module-level constant for the entry point at the bottom of the file.
args = parser.parse_args()

FOLDER, CHUNK_SIZE, CHUNK_OVERLAP, SEPARATOR, USE_TEX_SPLITTER = (
    args.folder,
    args.chunk_size,
    args.chunk_overlap,
    args.separator,
    args.use_tex_splitter,
)
|
|
|
|
|
class Ingest():
    """Load the documents of one folder, split them and add them to Chroma.

    Files are read from ``./data/<folder>``. Every stored chunk carries the
    metadata ``module=<folder>``; the constructor deletes all entries with
    that tag, so running the ingestion again for the same folder replaces
    the earlier chunks instead of duplicating them.
    """

    def __init__(
        self,
        folder: str,
        chunk_size: int,
        separator: str,
        chunk_overlap: int,
        use_tex_splitter: bool,
    ):
        """Open the vector store, drop stale entries and build the splitter.

        Args:
            folder: Name of the sub-folder of ``./data`` to ingest; also used
                as the ``module`` metadata value.
            chunk_size: Chunk size handed to the text splitter.
            separator: Separator for ``CharacterTextSplitter``; not used when
                ``use_tex_splitter`` is true.
            chunk_overlap: Chunk overlap handed to the text splitter.
            use_tex_splitter: Use ``LatexTextSplitter`` instead of
                ``CharacterTextSplitter``.
        """
        self.vectorstore = Chroma(persist_directory='./chroma', embedding_function=OpenAIEmbeddings())
        print(f"Count of {self.vectorstore._collection.count()} in vectostore")
        print(f"Deleting previous items from {folder}")
        # Remove whatever an earlier run stored for this folder.
        self.vectorstore._collection.delete(where={'module': folder})
        print(f"New count, {self.vectorstore._collection.count()}")

        self.folder = folder
        self.chunk_size = chunk_size
        self.data_path = os.path.join('./data', self.folder)

        # Build only the splitter that is actually going to be used.
        if use_tex_splitter:
            self.splitter = LatexTextSplitter(
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
            )
        else:
            self.splitter = CharacterTextSplitter(
                separator=separator,
                chunk_size=chunk_size,
                chunk_overlap=chunk_overlap,
                length_function=len,
            )

    @staticmethod
    def _read_text(path: str) -> str:
        """Return the whole content of the text file at *path*, read as UTF-8."""
        # An explicit encoding keeps the result independent of the platform's
        # default locale encoding.
        with open(path, "r", encoding="utf-8") as handle:
            return handle.read()

    @staticmethod
    def _source_name(path: str) -> str:
        """Return the file name of *path* without its final extension.

        ``os.path.splitext`` strips only the last suffix, so names that
        contain extra dots (``notes.v2.tex``) keep their distinguishing part
        instead of collapsing to the text before the first dot.
        """
        return os.path.splitext(os.path.basename(path))[0]

    def _files_with_suffix(self, suffix: str) -> list:
        """Return the sorted paths in ``self.data_path`` ending with *suffix*."""
        # Sorting makes the ingestion order independent of os.listdir's
        # arbitrary ordering.
        return sorted(
            os.path.join(self.data_path, name)
            for name in os.listdir(self.data_path)
            if name.endswith(suffix)
        )

    def _load_tex(self, path: str) -> str:
        """Return the content of the LaTeX file at *path*."""
        return self._read_text(path)

    def ingest(self) -> None:
        """Load every .pdf, .txt and .tex file, chunk them and store the chunks.

        PDF pages keep the metadata produced by ``PyPDFLoader`` with
        ``source`` replaced by the file stem; .txt and .tex files become a
        single document whose ``source`` is ``"<stem> transcript"``. All
        documents are tagged with ``module=self.folder`` before splitting.
        When nothing is found, or splitting yields no chunks, the vector
        store is left untouched.

        Raises:
            FileNotFoundError: If ``self.data_path`` does not exist
                (raised by ``os.listdir``).
        """
        documents = []

        for pdf_path in self._files_with_suffix(".pdf"):
            pages = PyPDFLoader(pdf_path).load()
            source = self._source_name(pdf_path)
            for page in pages:
                page.metadata['source'] = source
            documents.extend(pages)

        for txt_path in self._files_with_suffix(".txt"):
            documents.append(Document(
                page_content=self._read_text(txt_path),
                metadata={"source": self._source_name(txt_path) + ' transcript'},
            ))

        # NOTE(review): .tex sources receive the same ' transcript' suffix as
        # .txt files; kept as-is, confirm this labelling is intended.
        for tex_path in self._files_with_suffix(".tex"):
            documents.append(Document(
                page_content=self._load_tex(tex_path),
                metadata={"source": self._source_name(tex_path) + ' transcript'},
            ))

        if not documents:
            print(f"No .pdf, .txt or .tex files found in {self.data_path}, nothing to ingest")
            return

        # The module tag is what __init__ filters on when deleting old items.
        for document in documents:
            document.metadata['module'] = self.folder

        print("Splitting texts into chunks...")
        chunks = self.splitter.split_documents(documents)
        if not chunks:
            # Avoid handing empty lists to the vector store.
            print("No chunks produced, nothing to embed")
            return

        print("Embedding chunks...")
        self.vectorstore.add_texts(
            texts=[chunk.page_content for chunk in chunks],
            metadatas=[chunk.metadata for chunk in chunks],
        )
|
|
|
if __name__ == "__main__":
    # Script entry point: build the ingester from the parsed command-line
    # constants and run a single ingestion pass.
    ingest_options = {
        "folder": FOLDER,
        "chunk_size": CHUNK_SIZE,
        "separator": SEPARATOR,
        "chunk_overlap": CHUNK_OVERLAP,
        "use_tex_splitter": USE_TEX_SPLITTER,
    }
    ingest = Ingest(**ingest_options)
    ingest.ingest()