add examples, math, and auth
Browse files
- app.py +10 -2
- chroma/chroma-collections.parquet +1 -1
- chroma/chroma-embeddings.parquet +2 -2
- chroma/index/{id_to_uuid_48820301-4b52-46b2-8746-e343bf602b95.pkl → id_to_uuid_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl} +2 -2
- chroma/index/{index_48820301-4b52-46b2-8746-e343bf602b95.bin → index_33066827-7eb2-42ca-8a41-4459ce4b0011.bin} +2 -2
- chroma/index/{index_metadata_48820301-4b52-46b2-8746-e343bf602b95.pkl → index_metadata_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl} +1 -1
- chroma/index/{uuid_to_id_48820301-4b52-46b2-8746-e343bf602b95.pkl → uuid_to_id_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl} +2 -2
- ingest.py +27 -2
app.py
CHANGED
@@ -80,7 +80,7 @@ class Chatbot():
|
|
80 |
d.metadata['page'] = ''
|
81 |
|
82 |
output = ' '.join([
|
83 |
-
f'SOURCE {i}\n' + d.page_content + '\n\nSource: ' + d.metadata['source'] + '\nPage: ' + str(d.metadata['page']) + '\n\n\n'
|
84 |
for i, d in enumerate(documents)
|
85 |
])
|
86 |
|
@@ -207,6 +207,14 @@ class Chatbot():
|
|
207 |
submit.click(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
|
208 |
message.submit(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
|
209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
210 |
|
211 |
with gr.Tab("Search"):
|
212 |
|
@@ -244,7 +252,7 @@ class Chatbot():
|
|
244 |
message.submit(self.chat_vanilla, inputs=[message, vanilla_state, model], outputs=[vanilla_chatbot, vanilla_state])
|
245 |
|
246 |
|
247 |
-
block.launch(debug=True, share=False)
|
248 |
|
249 |
|
250 |
if __name__ == '__main__':
|
|
|
80 |
d.metadata['page'] = ''
|
81 |
|
82 |
output = ' '.join([
|
83 |
+
f'SOURCE {i}\n' + d.page_content.replace('$', '') + '\n\nSource: ' + d.metadata['source'] + '\nPage: ' + str(d.metadata['page']) + '\n\n\n' + '-'*100
|
84 |
for i, d in enumerate(documents)
|
85 |
])
|
86 |
|
|
|
207 |
submit.click(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
|
208 |
message.submit(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
|
209 |
|
210 |
+
gr.Examples(
|
211 |
+
examples=[
|
212 |
+
'Answer the following question, explain your reasoning:\n'
|
213 |
+
'Answer the following question, explain your reasoning, use latex format:\n'
|
214 |
+
'Answer the following multiple choice question, explain your reasoning:\n'
|
215 |
+
],
|
216 |
+
inputs=message
|
217 |
+
)
|
218 |
|
219 |
with gr.Tab("Search"):
|
220 |
|
|
|
252 |
message.submit(self.chat_vanilla, inputs=[message, vanilla_state, model], outputs=[vanilla_chatbot, vanilla_state])
|
253 |
|
254 |
|
255 |
+
block.launch(debug=True, share=False, auth=("bread", os.environ.get('PASSWORD')))
|
256 |
|
257 |
|
258 |
if __name__ == '__main__':
|
chroma/chroma-collections.parquet
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 557
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:a1c64656c27f073dc82f7d7e20cd31059ed38e6ac344408db5c052e56983ee59
|
3 |
size 557
|
chroma/chroma-embeddings.parquet
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:1655cb7989cce3e21c033127f334c095e5cca13458d9318ed1544bfed657f005
|
3 |
+
size 7266587
|
chroma/index/{id_to_uuid_48820301-4b52-46b2-8746-e343bf602b95.pkl → id_to_uuid_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09d54e466950cbc8b35a45821e139193bebccca0124a74635335341ad6f2dbf9
|
3 |
+
size 18544
|
chroma/index/{index_48820301-4b52-46b2-8746-e343bf602b95.bin → index_33066827-7eb2-42ca-8a41-4459ce4b0011.bin}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f58846f6d4fa248d0775e83d21968429f064925379b0e32aa8696719df67c46
|
3 |
+
size 3631264
|
chroma/index/{index_metadata_48820301-4b52-46b2-8746-e343bf602b95.pkl → index_metadata_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 74
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:55a26232ed945d77cb53a3db5983204243c0987f20fc0ca9b7a98b294b3956d1
|
3 |
size 74
|
chroma/index/{uuid_to_id_48820301-4b52-46b2-8746-e343bf602b95.pkl → uuid_to_id_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl}
RENAMED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:dd5b8c41ef8f11902aed5a95e89fd585402cf1f072216ae9794c9b950f939e7d
|
3 |
+
size 21687
|
ingest.py
CHANGED
@@ -3,7 +3,7 @@ import argparse
|
|
3 |
import os
|
4 |
|
5 |
from langchain.document_loaders import PyPDFLoader
|
6 |
-
from langchain.text_splitter import CharacterTextSplitter
|
7 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
8 |
from langchain.vectorstores import Chroma
|
9 |
from langchain.schema import Document
|
@@ -16,6 +16,7 @@ parser.add_argument("folder", help="The folder to be ingested", type=str)
|
|
16 |
parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
|
17 |
parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
|
18 |
parser.add_argument('--separator', help='Separator', type=str, default='\n')
|
|
|
19 |
|
20 |
args = parser.parse_args()
|
21 |
|
@@ -23,6 +24,7 @@ FOLDER = args.folder
|
|
23 |
CHUNK_SIZE = args.chunk_size
|
24 |
CHUNK_OVERLAP = args.chunk_overlap
|
25 |
SEPARATOR = args.separator
|
|
|
26 |
|
27 |
|
28 |
class Ingest():
|
@@ -33,7 +35,14 @@ class Ingest():
|
|
33 |
chunk_size,
|
34 |
separator,
|
35 |
chunk_overlap,
|
|
|
36 |
):
|
|
|
|
|
|
|
|
|
|
|
|
|
37 |
self.folder = folder
|
38 |
self.chunk_size = chunk_size
|
39 |
|
@@ -46,6 +55,15 @@ class Ingest():
|
|
46 |
length_function = len,
|
47 |
)
|
48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
|
50 |
def ingest(self):
|
51 |
# find all .pdf files in the data folder
|
@@ -65,6 +83,12 @@ class Ingest():
|
|
65 |
with open(os.path.join('./data', os.path.join(self.folder, t)), "r") as f:
|
66 |
documents.append(Document(page_content=f.read(), metadata={"source": os.path.basename(t).split(".")[0] + ' transcript'}))
|
67 |
|
|
|
|
|
|
|
|
|
|
|
|
|
68 |
for i in documents:
|
69 |
i.metadata['module'] = self.folder
|
70 |
|
@@ -75,7 +99,7 @@ class Ingest():
|
|
75 |
embeddings = OpenAIEmbeddings()
|
76 |
# create store
|
77 |
print("Embedding chunks...")
|
78 |
-
|
79 |
|
80 |
if __name__ == "__main__":
|
81 |
ingest = Ingest(
|
@@ -83,5 +107,6 @@ if __name__ == "__main__":
|
|
83 |
chunk_size = CHUNK_SIZE,
|
84 |
separator = SEPARATOR,
|
85 |
chunk_overlap = CHUNK_OVERLAP,
|
|
|
86 |
)
|
87 |
ingest.ingest()
|
|
|
3 |
import os
|
4 |
|
5 |
from langchain.document_loaders import PyPDFLoader
|
6 |
+
from langchain.text_splitter import CharacterTextSplitter, LatexTextSplitter
|
7 |
from langchain.embeddings.openai import OpenAIEmbeddings
|
8 |
from langchain.vectorstores import Chroma
|
9 |
from langchain.schema import Document
|
|
|
16 |
parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
|
17 |
parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
|
18 |
parser.add_argument('--separator', help='Separator', type=str, default='\n')
|
19 |
+
parser.add_argument('--use_tex_splitter', help='Use tex splitter', type=bool, default=False)
|
20 |
|
21 |
args = parser.parse_args()
|
22 |
|
|
|
24 |
CHUNK_SIZE = args.chunk_size
|
25 |
CHUNK_OVERLAP = args.chunk_overlap
|
26 |
SEPARATOR = args.separator
|
27 |
+
USE_TEX_SPLITTER = args.use_tex_splitter
|
28 |
|
29 |
|
30 |
class Ingest():
|
|
|
35 |
chunk_size,
|
36 |
separator,
|
37 |
chunk_overlap,
|
38 |
+
use_tex_splitter,
|
39 |
):
|
40 |
+
self.vectorstore = Chroma(persist_directory='./chroma', embedding_function=OpenAIEmbeddings())
|
41 |
+
print(f"Count of {self.vectorstore._collection.count()} in vectostore")
|
42 |
+
print(f"Deleting previous items from {folder}")
|
43 |
+
self.vectorstore._collection.delete(where={'module' : folder})
|
44 |
+
print(f"New count, {self.vectorstore._collection.count()}")
|
45 |
+
|
46 |
self.folder = folder
|
47 |
self.chunk_size = chunk_size
|
48 |
|
|
|
55 |
length_function = len,
|
56 |
)
|
57 |
|
58 |
+
if use_tex_splitter:
|
59 |
+
self.splitter = LatexTextSplitter(
|
60 |
+
chunk_size = chunk_size,
|
61 |
+
chunk_overlap = chunk_overlap,
|
62 |
+
)
|
63 |
+
|
64 |
+
def _load_tex(self, path):
|
65 |
+
with open(path, "r") as f:
|
66 |
+
return f.read()
|
67 |
|
68 |
def ingest(self):
|
69 |
# find all .pdf files in the data folder
|
|
|
83 |
with open(os.path.join('./data', os.path.join(self.folder, t)), "r") as f:
|
84 |
documents.append(Document(page_content=f.read(), metadata={"source": os.path.basename(t).split(".")[0] + ' transcript'}))
|
85 |
|
86 |
+
# tex
|
87 |
+
texfiles = [f for f in os.listdir(os.path.join('./data', self.folder)) if f.endswith(".tex")]
|
88 |
+
for t in texfiles:
|
89 |
+
documents.append(Document(page_content=self._load_tex(os.path.join('./data', os.path.join(self.folder, t))), metadata={"source": os.path.basename(t).split(".")[0] + ' transcript'}))
|
90 |
+
|
91 |
+
|
92 |
for i in documents:
|
93 |
i.metadata['module'] = self.folder
|
94 |
|
|
|
99 |
embeddings = OpenAIEmbeddings()
|
100 |
# create store
|
101 |
print("Embedding chunks...")
|
102 |
+
self.vectorstore.add_texts(texts=[d.page_content for d in chunks], metadatas=[d.metadata for d in chunks])
|
103 |
|
104 |
if __name__ == "__main__":
|
105 |
ingest = Ingest(
|
|
|
107 |
chunk_size = CHUNK_SIZE,
|
108 |
separator = SEPARATOR,
|
109 |
chunk_overlap = CHUNK_OVERLAP,
|
110 |
+
use_tex_splitter = USE_TEX_SPLITTER,
|
111 |
)
|
112 |
ingest.ingest()
|