elyx committed on
Commit
fc2bb77
·
1 Parent(s): 75b1ee5

add examples, math, and auth

Browse files
app.py CHANGED
@@ -80,7 +80,7 @@ class Chatbot():
80
  d.metadata['page'] = ''
81
 
82
  output = ' '.join([
83
- f'SOURCE {i}\n' + d.page_content + '\n\nSource: ' + d.metadata['source'] + '\nPage: ' + str(d.metadata['page']) + '\n\n\n'
84
  for i, d in enumerate(documents)
85
  ])
86
 
@@ -207,6 +207,14 @@ class Chatbot():
207
  submit.click(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
208
  message.submit(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
209
 
 
 
 
 
 
 
 
 
210
 
211
  with gr.Tab("Search"):
212
 
@@ -244,7 +252,7 @@ class Chatbot():
244
  message.submit(self.chat_vanilla, inputs=[message, vanilla_state, model], outputs=[vanilla_chatbot, vanilla_state])
245
 
246
 
247
- block.launch(debug=True, share=False)
248
 
249
 
250
  if __name__ == '__main__':
 
80
  d.metadata['page'] = ''
81
 
82
  output = ' '.join([
83
+ f'SOURCE {i}\n' + d.page_content.replace('$', '') + '\n\nSource: ' + d.metadata['source'] + '\nPage: ' + str(d.metadata['page']) + '\n\n\n' + '-'*100
84
  for i, d in enumerate(documents)
85
  ])
86
 
 
207
  submit.click(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
208
  message.submit(self.chat, inputs=[message, state, module, model], outputs=[chatbot, state])
209
 
210
+ gr.Examples(
211
+ examples=[
212
+ 'Answer the following question, explain your reasoning:\n'
213
+ 'Answer the following question, explain your reasoning, use latex format:\n'
214
+ 'Answer the following multiple choice question, explain your reasoning:\n'
215
+ ],
216
+ inputs=message
217
+ )
218
 
219
  with gr.Tab("Search"):
220
 
 
252
  message.submit(self.chat_vanilla, inputs=[message, vanilla_state, model], outputs=[vanilla_chatbot, vanilla_state])
253
 
254
 
255
+ block.launch(debug=True, share=False, auth=("bread", os.environ.get('PASSWORD')))
256
 
257
 
258
  if __name__ == '__main__':
chroma/chroma-collections.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:df46d39377ae5665c6f52890a51c1b39d55ce5cca43006a62d775f558d01a3d7
3
  size 557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a1c64656c27f073dc82f7d7e20cd31059ed38e6ac344408db5c052e56983ee59
3
  size 557
chroma/chroma-embeddings.parquet CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ed9ecd5766659fab4de819f1149e7022fa7bac9700fed908c8bb8cd7fbee2f18
3
- size 5751374
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1655cb7989cce3e21c033127f334c095e5cca13458d9318ed1544bfed657f005
3
+ size 7266587
chroma/index/{id_to_uuid_48820301-4b52-46b2-8746-e343bf602b95.pkl → id_to_uuid_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d94ee3ba8d5ddcb17a8ca31845f96988cae28412304707b61389d72bc38d1a1c
3
- size 14648
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d54e466950cbc8b35a45821e139193bebccca0124a74635335341ad6f2dbf9
3
+ size 18544
chroma/index/{index_48820301-4b52-46b2-8746-e343bf602b95.bin → index_33066827-7eb2-42ca-8a41-4459ce4b0011.bin} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8cd4d746421153bac9a1ee713618ae655718472fb597caf0a54998fd08908c25
3
- size 2882380
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f58846f6d4fa248d0775e83d21968429f064925379b0e32aa8696719df67c46
3
+ size 3631264
chroma/index/{index_metadata_48820301-4b52-46b2-8746-e343bf602b95.pkl → index_metadata_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7384c6ad2206ea30195640a1aa2b8721341f26aae78fa7fc9f584705dbceb88c
3
  size 74
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:55a26232ed945d77cb53a3db5983204243c0987f20fc0ca9b7a98b294b3956d1
3
  size 74
chroma/index/{uuid_to_id_48820301-4b52-46b2-8746-e343bf602b95.pkl → uuid_to_id_33066827-7eb2-42ca-8a41-4459ce4b0011.pkl} RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c28815d0892801f4674238321a1bfe266dd5f4f802bfd85faea9cc82c3dad51f
3
- size 17165
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dd5b8c41ef8f11902aed5a95e89fd585402cf1f072216ae9794c9b950f939e7d
3
+ size 21687
ingest.py CHANGED
@@ -3,7 +3,7 @@ import argparse
3
  import os
4
 
5
  from langchain.document_loaders import PyPDFLoader
6
- from langchain.text_splitter import CharacterTextSplitter
7
  from langchain.embeddings.openai import OpenAIEmbeddings
8
  from langchain.vectorstores import Chroma
9
  from langchain.schema import Document
@@ -16,6 +16,7 @@ parser.add_argument("folder", help="The folder to be ingested", type=str)
16
  parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
17
  parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
18
  parser.add_argument('--separator', help='Separator', type=str, default='\n')
 
19
 
20
  args = parser.parse_args()
21
 
@@ -23,6 +24,7 @@ FOLDER = args.folder
23
  CHUNK_SIZE = args.chunk_size
24
  CHUNK_OVERLAP = args.chunk_overlap
25
  SEPARATOR = args.separator
 
26
 
27
 
28
  class Ingest():
@@ -33,7 +35,14 @@ class Ingest():
33
  chunk_size,
34
  separator,
35
  chunk_overlap,
 
36
  ):
 
 
 
 
 
 
37
  self.folder = folder
38
  self.chunk_size = chunk_size
39
 
@@ -46,6 +55,15 @@ class Ingest():
46
  length_function = len,
47
  )
48
 
 
 
 
 
 
 
 
 
 
49
 
50
  def ingest(self):
51
  # find all .pdf files in the data folder
@@ -65,6 +83,12 @@ class Ingest():
65
  with open(os.path.join('./data', os.path.join(self.folder, t)), "r") as f:
66
  documents.append(Document(page_content=f.read(), metadata={"source": os.path.basename(t).split(".")[0] + ' transcript'}))
67
 
 
 
 
 
 
 
68
  for i in documents:
69
  i.metadata['module'] = self.folder
70
 
@@ -75,7 +99,7 @@ class Ingest():
75
  embeddings = OpenAIEmbeddings()
76
  # create store
77
  print("Embedding chunks...")
78
- Chroma.from_documents(chunks, embeddings, persist_directory='./chroma')
79
 
80
  if __name__ == "__main__":
81
  ingest = Ingest(
@@ -83,5 +107,6 @@ if __name__ == "__main__":
83
  chunk_size = CHUNK_SIZE,
84
  separator = SEPARATOR,
85
  chunk_overlap = CHUNK_OVERLAP,
 
86
  )
87
  ingest.ingest()
 
3
  import os
4
 
5
  from langchain.document_loaders import PyPDFLoader
6
+ from langchain.text_splitter import CharacterTextSplitter, LatexTextSplitter
7
  from langchain.embeddings.openai import OpenAIEmbeddings
8
  from langchain.vectorstores import Chroma
9
  from langchain.schema import Document
 
16
  parser.add_argument("--chunk_size", help="Chunk size", type=int, default=1500)
17
  parser.add_argument('--chunk_overlap', help='Chunk overlap', type=int, default=400)
18
  parser.add_argument('--separator', help='Separator', type=str, default='\n')
19
+ parser.add_argument('--use_tex_splitter', help='Use tex splitter', type=bool, default=False)
20
 
21
  args = parser.parse_args()
22
 
 
24
  CHUNK_SIZE = args.chunk_size
25
  CHUNK_OVERLAP = args.chunk_overlap
26
  SEPARATOR = args.separator
27
+ USE_TEX_SPLITTER = args.use_tex_splitter
28
 
29
 
30
  class Ingest():
 
35
  chunk_size,
36
  separator,
37
  chunk_overlap,
38
+ use_tex_splitter,
39
  ):
40
+ self.vectorstore = Chroma(persist_directory='./chroma', embedding_function=OpenAIEmbeddings())
41
+ print(f"Count of {self.vectorstore._collection.count()} in vectostore")
42
+ print(f"Deleting previous items from {folder}")
43
+ self.vectorstore._collection.delete(where={'module' : folder})
44
+ print(f"New count, {self.vectorstore._collection.count()}")
45
+
46
  self.folder = folder
47
  self.chunk_size = chunk_size
48
 
 
55
  length_function = len,
56
  )
57
 
58
+ if use_tex_splitter:
59
+ self.splitter = LatexTextSplitter(
60
+ chunk_size = chunk_size,
61
+ chunk_overlap = chunk_overlap,
62
+ )
63
+
64
+ def _load_tex(self, path):
65
+ with open(path, "r") as f:
66
+ return f.read()
67
 
68
  def ingest(self):
69
  # find all .pdf files in the data folder
 
83
  with open(os.path.join('./data', os.path.join(self.folder, t)), "r") as f:
84
  documents.append(Document(page_content=f.read(), metadata={"source": os.path.basename(t).split(".")[0] + ' transcript'}))
85
 
86
+ # tex
87
+ texfiles = [f for f in os.listdir(os.path.join('./data', self.folder)) if f.endswith(".tex")]
88
+ for t in texfiles:
89
+ documents.append(Document(page_content=self._load_tex(os.path.join('./data', os.path.join(self.folder, t))), metadata={"source": os.path.basename(t).split(".")[0] + ' transcript'}))
90
+
91
+
92
  for i in documents:
93
  i.metadata['module'] = self.folder
94
 
 
99
  embeddings = OpenAIEmbeddings()
100
  # create store
101
  print("Embedding chunks...")
102
+ self.vectorstore.add_texts(texts=[d.page_content for d in chunks], metadatas=[d.metadata for d in chunks])
103
 
104
  if __name__ == "__main__":
105
  ingest = Ingest(
 
107
  chunk_size = CHUNK_SIZE,
108
  separator = SEPARATOR,
109
  chunk_overlap = CHUNK_OVERLAP,
110
+ use_tex_splitter = USE_TEX_SPLITTER,
111
  )
112
  ingest.ingest()