hayuh committed
Commit e38a9d9 · verified · 1 Parent(s): 81fae24

Upload 20 files

.gitattributes CHANGED
@@ -33,3 +33,9 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+Ehlers-Danlos-1/2024_EDS_2.pdf filter=lfs diff=lfs merge=lfs -text
+Ehlers-Danlos-1/2024_EDS_3.pdf filter=lfs diff=lfs merge=lfs -text
+Ehlers-Danlos-1/2024_EDS_4.pdf filter=lfs diff=lfs merge=lfs -text
+Ehlers-Danlos-1/2024_EDS_5.pdf filter=lfs diff=lfs merge=lfs -text
+Ehlers-Danlos-1/Unknown_EDS_1.pdf filter=lfs diff=lfs merge=lfs -text
+Ehlers-Danlos-1/Unknown_EDS_5.pdf filter=lfs diff=lfs merge=lfs -text
Ehlers-Danlos-1/2024_EDS_1.pdf ADDED
The diff for this file is too large to render.
 
Ehlers-Danlos-1/2024_EDS_2.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:46fc736ff4174473e0a846b7ca8430c140d89cd2c9f663e105bc48b33f8d9c99
+size 2616000
Ehlers-Danlos-1/2024_EDS_3.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7fef5c8c375297158ad7ad63166405ca7ce4ac511371a8454fe9df972755b0fe
+size 10344738
Ehlers-Danlos-1/2024_EDS_4.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:25db35c77fd6aeba6b15278671a462b30ffbb6f97eb5f221e0459f6d11c0f8ed
+size 1071576
Ehlers-Danlos-1/2024_EDS_5.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57ef98bcb445da6abda66de35204634bd81d8c6dcdf53bfc3be54447ec9ad0ad
+size 2772421
Ehlers-Danlos-1/2024_EDS_6.pdf ADDED
Binary file (146 kB).
 
Ehlers-Danlos-1/2024_EDS_7.pdf ADDED
The diff for this file is too large to render.
 
Ehlers-Danlos-1/Unknown_EDS_1.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbeaf13d3298a00bc1c7acfba3177a0c639f677e0f0941452709fe60542052d4
+size 21553835
Ehlers-Danlos-1/Unknown_EDS_2.pdf ADDED
Binary file (428 kB).
 
Ehlers-Danlos-1/Unknown_EDS_3.pdf ADDED
Binary file (817 kB).
 
Ehlers-Danlos-1/Unknown_EDS_4.pdf ADDED
Binary file (392 kB).
 
Ehlers-Danlos-1/Unknown_EDS_5.pdf ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5c5a77524b6bb4dca40798af5ff3e3c622216a13ac21a60d9befce255977b47a
+size 1847313
app.py ADDED
@@ -0,0 +1,144 @@
+import os
+import glob
+from pathlib import Path
+import gradio as gr
+import nest_asyncio
+import dill as pickle
+
+# Allow nested asyncio event loops (LlamaIndex runs async code internally)
+nest_asyncio.apply()
+
+# Import the OpenAI key with a helper function
+from helper import get_openai_api_key
+OPENAI_API_KEY = get_openai_api_key()
+
+# Define the path to the directory containing the PDF files
+folder_path = 'Ehlers-Danlos-1'
+
+# Get the list of all PDF files in the directory
+pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))
+print(pdf_files)
+
+# Extract just the filenames (optional)
+pdf_filenames = [os.path.basename(pdf) for pdf in pdf_files]
+print(pdf_filenames)
+
+# Import utilities
+from utils import get_doc_tools
+
+# Truncate function names if necessary (OpenAI tool/function names are limited to 64 characters)
+def truncate_function_name(name, max_length=64):
+    return name if len(name) <= max_length else name[:max_length]
+
+# Path to save/load serialized tools
+tools_cache_path = 'tools_cache.pkl'
+
+# Initialize paper_to_tools_dict
+paper_to_tools_dict = {}
+
+# Load cached tools if the cache file exists and is not empty
+if os.path.exists(tools_cache_path) and os.path.getsize(tools_cache_path) > 0:
+    try:
+        with open(tools_cache_path, 'rb') as f:
+            paper_to_tools_dict = pickle.load(f)
+    except EOFError:
+        print("Cache file is corrupted. Recreating tools.")
+        paper_to_tools_dict = {}
+else:
+    print("Cache file does not exist or is empty. Recreating tools.")
+
+# Create tools for each PDF if they were not loaded from the cache
+if not paper_to_tools_dict:
+    for pdf in pdf_files:
+        print(f"Getting tools for paper: {pdf}")
+        vector_tool, summary_tool = get_doc_tools(pdf, Path(pdf).stem)
+        paper_to_tools_dict[pdf] = [vector_tool, summary_tool]
+
+    # Save tools to cache
+    with open(tools_cache_path, 'wb') as f:
+        pickle.dump(paper_to_tools_dict, f)
+
+# Combine all tools into a single list
+all_tools = [t for pdf in pdf_files for t in paper_to_tools_dict[pdf]]
+
+# Define an object index and retriever over these tools
+from llama_index.core import VectorStoreIndex
+from llama_index.core.objects import ObjectIndex
+
+obj_index = ObjectIndex.from_objects(
+    all_tools,
+    index_cls=VectorStoreIndex,
+)
+
+obj_retriever = obj_index.as_retriever(similarity_top_k=3)
+
+# Initialize the OpenAI LLM
+from llama_index.llms.openai import OpenAI
+llm = OpenAI(model="gpt-3.5-turbo")
+
+# Set up the agent
+from llama_index.core.agent import FunctionCallingAgentWorker
+from llama_index.core.agent import AgentRunner
+
+agent_worker = FunctionCallingAgentWorker.from_tools(
+    tool_retriever=obj_retriever,
+    llm=llm,
+    verbose=True
+)
+agent = AgentRunner(agent_worker)
+
+# Define the function to query the agent
+def ask_agent(question):
+    response = agent.query(question)
+    return str(response)
+
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=ask_agent,
+    inputs="text",
+    outputs="text",
+    title="EDS Research Agent",
+)
+
+# Launch the Gradio app
+iface.launch(share=True)
+
+# Commented-out earlier experiments, kept for reference:
+"""
+import streamlit as st
+from transformers import pipeline
+
+# Load your model
+generator = pipeline('text-generation', model='gpt-3.5-turbo')
+
+# Streamlit interface
+st.title("Text Generator")
+prompt = st.text_input("Enter your prompt:")
+if st.button("Generate"):
+    result = generator(prompt, max_length=50)
+    st.write(result[0]['generated_text'])
+"""
+
+"""
+import gradio as gr
+from transformers import pipeline
+
+# Load your model
+generator = pipeline('text-generation', model='gpt-3.5-turbo')
+
+# Define the function to generate text
+def generate_text(prompt):
+    result = generator(prompt, max_length=50)
+    return result[0]['generated_text']
+
+# Create the Gradio interface
+iface = gr.Interface(fn=generate_text, inputs="text", outputs="text", title="Text Generator")
+
+# Launch the interface
+iface.launch()
+"""
+
+"""
+import torch
+print(torch.__version__)
+"""
helper.py ADDED
@@ -0,0 +1,13 @@
+# Add your utilities or helper functions to this file.
+
+import os
+from dotenv import load_dotenv, find_dotenv
+
+# These helpers expect to find a .env file in the project directory (or a parent directory).
+# The format for that file is: API_KEYNAME=AStringThatIsTheLongAPIKeyFromSomeService
+def load_env():
+    _ = load_dotenv(find_dotenv())
+
+def get_openai_api_key():
+    load_env()
+    openai_api_key = os.getenv("OPENAI_API_KEY")
+    return openai_api_key
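For the lookup above to succeed, a .env file containing a line of the form OPENAI_API_KEY=<your key> must be discoverable by find_dotenv(). A minimal sketch of how a caller might fail fast when the key is missing (the error message is illustrative, not part of the committed code):

# Hypothetical guard: abort early if the .env file or key is absent.
from helper import get_openai_api_key

api_key = get_openai_api_key()
if not api_key:
    raise RuntimeError("OPENAI_API_KEY not found; add it to a .env file as OPENAI_API_KEY=<your key>")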
rag.ipynb ADDED
@@ -0,0 +1,222 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#import OpenAI key with helper function\n",
+    "from helper import get_openai_api_key\n",
+    "\n",
+    "OPENAI_API_KEY = get_openai_api_key()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#A lot of modules use async and we want them to be compatible with Jupyter notebook\n",
+    "import nest_asyncio\n",
+    "\n",
+    "nest_asyncio.apply()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Ehlers-Danlos-1\\\\2024_EDS_1.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_2.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_3.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_4.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_5.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_6.pdf', 'Ehlers-Danlos-1\\\\2024_EDS_7.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_1.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_2.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_3.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_4.pdf', 'Ehlers-Danlos-1\\\\Unknown_EDS_5.pdf']\n",
+      "['2024_EDS_1.pdf', '2024_EDS_2.pdf', '2024_EDS_3.pdf', '2024_EDS_4.pdf', '2024_EDS_5.pdf', '2024_EDS_6.pdf', '2024_EDS_7.pdf', 'Unknown_EDS_1.pdf', 'Unknown_EDS_2.pdf', 'Unknown_EDS_3.pdf', 'Unknown_EDS_4.pdf', 'Unknown_EDS_5.pdf']\n"
+     ]
+    }
+   ],
+   "source": [
+    "import os\n",
+    "import glob\n",
+    "\n",
+    "# Define the path to the directory containing the PDF files\n",
+    "folder_path = 'Ehlers-Danlos-1'\n",
+    "\n",
+    "# Get the list of all PDF files in the directory\n",
+    "pdf_files = glob.glob(os.path.join(folder_path, '*.pdf'))\n",
+    "print(pdf_files)\n",
+    "\n",
+    "# Extract just the filenames (optional)\n",
+    "pdf_filenames = [os.path.basename(pdf) for pdf in pdf_files]\n",
+    "\n",
+    "# Print the list of PDF filenames\n",
+    "print(pdf_filenames)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_1.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_2.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_3.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_4.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_5.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_6.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\2024_EDS_7.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_1.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_2.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_3.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_4.pdf\n",
+      "Getting tools for paper: Ehlers-Danlos-1\\Unknown_EDS_5.pdf\n"
+     ]
+    }
+   ],
+   "source": [
+    "from utils import get_doc_tools\n",
+    "from pathlib import Path\n",
+    "\n",
+    "# Ensure function names are within the allowed length limit\n",
+    "def truncate_function_name(name, max_length=64):\n",
+    "    return name if len(name) <= max_length else name[:max_length]\n",
+    "\n",
+    "paper_to_tools_dict = {}\n",
+    "for pdf in pdf_files:\n",
+    "    print(f\"Getting tools for paper: {pdf}\")\n",
+    "    vector_tool, summary_tool = get_doc_tools(pdf, Path(pdf).stem)\n",
+    "    #vector_tool, summary_tool = get_doc_tools(pdf, truncate_function_name(Path(pdf).stem))\n",
+    "    paper_to_tools_dict[pdf] = [vector_tool, summary_tool]\n",
+    "    #print(vector_tool)\n",
+    "    #print(summary_tool)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "all_tools = [t for pdf in pdf_files for t in paper_to_tools_dict[pdf]]\n",
+    "#all_tools = [truncate_function_name(tool) for tool in all_tools]\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# define an \"object\" index and retriever over these tools\n",
+    "from llama_index.core import VectorStoreIndex\n",
+    "from llama_index.core.objects import ObjectIndex\n",
+    "\n",
+    "obj_index = ObjectIndex.from_objects(\n",
+    "    all_tools,\n",
+    "    index_cls=VectorStoreIndex,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "obj_retriever = obj_index.as_retriever(similarity_top_k=3)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.llms.openai import OpenAI\n",
+    "\n",
+    "llm = OpenAI(model=\"gpt-3.5-turbo\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from llama_index.core.agent import FunctionCallingAgentWorker\n",
+    "from llama_index.core.agent import AgentRunner\n",
+    "\n",
+    "agent_worker = FunctionCallingAgentWorker.from_tools(\n",
+    "    tool_retriever=obj_retriever,\n",
+    "    llm=llm,\n",
+    "    verbose=True\n",
+    ")\n",
+    "agent = AgentRunner(agent_worker)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'agent' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)",
+      "Cell \u001b[1;32mIn[1], line 1\u001b[0m\n\u001b[1;32m----> 1\u001b[0m response \u001b[38;5;241m=\u001b[39m \u001b[43magent\u001b[49m\u001b[38;5;241m.\u001b[39mquery(\n\u001b[0;32m 2\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mDo people with EDS suffer from dislocations, and if so, how do they manifest?\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[0;32m 3\u001b[0m )\n\u001b[0;32m 4\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;28mstr\u001b[39m(response))\n",
+      "\u001b[1;31mNameError\u001b[0m: name 'agent' is not defined"
+     ]
+    },
+    {
+     "ename": "",
+     "evalue": "",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n",
+      "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n",
+      "\u001b[1;31mClick <a href='https://aka.ms/vscodeJupyterKernelCrash'>here</a> for more info. \n",
+      "\u001b[1;31mView Jupyter <a href='command:jupyter.viewOutput'>log</a> for further details."
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "response = agent.query(\n",
+    "    \"Do people with EDS suffer from dislocations, and if so, how do they manifest?\"\n",
+    ")\n",
+    "print(str(response))"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
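The NameError and kernel crash in the final cell come from running the query cell in a fresh kernel before the earlier setup cells; re-running the agent-construction cell first avoids it. A small guard cell, purely illustrative and not part of the committed notebook, could make that failure mode explicit:

# Hypothetical guard cell: give a clear message if the setup cells were skipped.
try:
    agent
except NameError:
    raise RuntimeError("Run the setup cells above (which define `agent`) before querying.") from None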
ragas_eval.py ADDED
@@ -0,0 +1,36 @@
+import os
+import sys
+from helper import get_openai_api_key
+
+# Add the local virtual environment's site-packages to sys.path
+# (adjust the 'Lib' / 'python3.12' segments to match your platform's venv layout)
+venv_path = os.path.join(os.path.dirname(__file__), 'venv', 'Lib', 'python3.12', 'site-packages')
+sys.path.append(venv_path)
+
+os.environ["OPENAI_API_KEY"] = get_openai_api_key()
+
+# Load all documents in the EDS paper folder
+from langchain_community.document_loaders import DirectoryLoader
+loader = DirectoryLoader("Ehlers-Danlos-1")
+documents = loader.load()
+
+for document in documents:
+    document.metadata['filename'] = document.metadata['source']
+
+from ragas.testset.generator import TestsetGenerator
+from ragas.testset.evolutions import simple, reasoning, multi_context
+from langchain_openai import ChatOpenAI, OpenAIEmbeddings
+
+# Generator and critic use OpenAI models
+generator_llm = ChatOpenAI(model="gpt-3.5-turbo")
+critic_llm = ChatOpenAI(model="gpt-4")
+embeddings = OpenAIEmbeddings()
+
+generator = TestsetGenerator.from_langchain(
+    generator_llm,
+    critic_llm,
+    embeddings
+)
+
+# Generate the test set
+testset = generator.generate_with_langchain_docs(
+    documents,
+    test_size=10,
+    distributions={simple: 0.5, reasoning: 0.25, multi_context: 0.25}
+)
+print("Generated testset:")
+print(testset)
+testset.to_pandas()
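testset.to_pandas() returns a DataFrame that the script currently discards. A natural follow-up, sketched here with an assumed output filename, is to persist it so later RAGAS evaluation runs can reuse the same questions:

# Hypothetical follow-up: save the generated questions/contexts for reuse.
df = testset.to_pandas()
df.to_csv("eds_ragas_testset.csv", index=False)
print(df.head())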
requirements.txt ADDED
@@ -0,0 +1,14 @@
+# Requirements file
+# Note which revision of Python this targets, for example 3.9.6.
+# List all pip-installable dependencies here, pinned to a revision where possible.
+
+python-dotenv==1.0.0
+
+llama-index==0.10.27
+llama-index-llms-openai==0.1.15
+llama-index-embeddings-openai==0.1.7
+
+gradio
+transformers
+torch>=1.8.0
+
+# Also imported by app.py (unpinned)
+nest_asyncio
+dill
test.py ADDED
@@ -0,0 +1,26 @@
+import sys
+import os
+
+# Add the virtual environment's site-packages to sys.path.
+# On Unix the path normally includes a 'pythonX.Y' segment (e.g. venv/lib/python3.12/site-packages);
+# on Windows it is venv\Lib\site-packages.
+venv_path = os.path.join(os.path.dirname(__file__), 'venv', 'lib', 'site-packages')
+sys.path.append(venv_path)
+
+# Sanity-check the import path by printing what Python can see
+print("sys.path:", sys.path)
+print("Contents of venv_path:", os.listdir(venv_path))
+
+# Now import the TestsetGenerator
+try:
+    from ragas.testset.generator import TestsetGenerator
+    print("Successfully imported TestsetGenerator.")
+except ImportError as e:
+    print("ImportError:", e)
+
+# Use the imported class
+try:
+    generator = TestsetGenerator()
+    print("Successfully created a TestsetGenerator instance.")
+except Exception as e:
+    print("Error creating TestsetGenerator instance:", e)
tools_cache.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bd5c05da1d3423251cb117b519c5f46662199b6c15f4db4591e226f70a584d6
+size 8897145
utils.py ADDED
@@ -0,0 +1,82 @@
+from typing import List, Optional
+
+from llama_index.core import SimpleDirectoryReader, SummaryIndex, VectorStoreIndex
+from llama_index.core.node_parser import SentenceSplitter
+from llama_index.core.tools import FunctionTool, QueryEngineTool
+from llama_index.core.vector_stores import MetadataFilters, FilterCondition
+
+
+def get_doc_tools(
+    file_path: str,
+    name: str,
+):
+    """Return a vector query tool and a summary tool for a single document."""
+
+    # Load the document and split it into nodes
+    documents = SimpleDirectoryReader(input_files=[file_path]).load_data()
+    splitter = SentenceSplitter(chunk_size=1024)
+    nodes = splitter.get_nodes_from_documents(documents)
+    vector_index = VectorStoreIndex(nodes)
+
+    def vector_query(
+        query: str,
+        page_numbers: Optional[List[str]] = None
+    ) -> str:
+        """Use to answer questions over a given paper.
+
+        Useful if you have specific questions over the paper.
+        Always leave page_numbers as None UNLESS there is a specific page you want to search for.
+
+        Args:
+            query (str): the string query to be embedded.
+            page_numbers (Optional[List[str]]): Filter by a set of pages. Leave as None
+                to perform a vector search over all pages; otherwise, filter by the
+                specified pages.
+        """
+
+        page_numbers = page_numbers or []
+        metadata_dicts = [
+            {"key": "page_label", "value": p} for p in page_numbers
+        ]
+
+        query_engine = vector_index.as_query_engine(
+            similarity_top_k=2,
+            filters=MetadataFilters.from_dicts(
+                metadata_dicts,
+                condition=FilterCondition.OR
+            )
+        )
+        response = query_engine.query(query)
+        return str(response)
+
+    vector_query_tool = FunctionTool.from_defaults(
+        name=f"vector_tool_{name}",
+        fn=vector_query
+    )
+
+    summary_index = SummaryIndex(nodes)
+    summary_query_engine = summary_index.as_query_engine(
+        response_mode="tree_summarize",
+        use_async=True,
+    )
+    summary_tool = QueryEngineTool.from_defaults(
+        name=f"summary_tool_{name}",
+        query_engine=summary_query_engine,
+        description=(
+            f"Useful for summarization questions related to {name}"
+        ),
+    )
+
+    return vector_query_tool, summary_tool
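
For reference, a minimal sketch of exercising get_doc_tools on one of the committed papers; the file choice and questions are illustrative only, and it assumes OPENAI_API_KEY is available via helper.py.

# Hypothetical usage sketch for utils.get_doc_tools
from utils import get_doc_tools

vector_tool, summary_tool = get_doc_tools("Ehlers-Danlos-1/2024_EDS_6.pdf", "2024_EDS_6")

# The summary tool wraps a tree-summarize query engine over the whole document
print(summary_tool.query_engine.query("Summarize the key findings of this paper."))

# The vector tool wraps the vector_query function defined above
print(vector_tool(query="What symptoms are described in this paper?"))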