File size: 4,286 Bytes
7ab5cc9
 
 
 
 
549b455
7b84d22
7ab5cc9
4525d51
0368e08
4e43404
4525d51
0368e08
4e43404
e8cda75
 
0368e08
e8cda75
 
 
 
0368e08
e8cda75
7ab5cc9
 
 
 
 
 
 
549b455
0368e08
7ab5cc9
 
 
 
 
 
 
 
e9c92d6
 
 
 
0368e08
7ab5cc9
e9c92d6
7ab5cc9
 
 
 
 
4525d51
0368e08
7ab5cc9
4525d51
0368e08
7ab5cc9
 
 
 
4525d51
0368e08
7ab5cc9
 
4525d51
0368e08
7ab5cc9
 
 
 
955cdd2
7ab5cc9
 
 
 
 
4525d51
0368e08
549b455
7ab5cc9
08e5eff
62c86fa
7ab5cc9
 
 
 
 
5a55441
 
 
 
7ab5cc9
 
 
 
5a55441
 
dc171bc
7ab5cc9
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import gradio as gr
from sentence_transformers import SentenceTransformer
import pandas as pd
import pickle
from pathlib import Path  
import time
from datetime import datetime

# --- One-time startup: load the embedding model, the English Quran text, and
# --- the precomputed verse embeddings. Runs once at import time; the prints
# --- are coarse timing markers for each load step.
print("load model start")
print(datetime.fromtimestamp(time.time()))
model = SentenceTransformer('intfloat/multilingual-e5-large-instruct')
print("load model end")
print(datetime.fromtimestamp(time.time()))

# Full English translation, one row per (sura, aya) verse — TODO confirm schema.
quran = pd.read_csv('quran-eng.csv', delimiter=",")
print("load quran eng")
print(datetime.fromtimestamp(time.time()))

# Use a context manager so the pickle file handle is closed promptly
# (the original `open(...)` leaked the handle for the process lifetime).
# NOTE(review): pickle.load on an untrusted file is unsafe — this assumes
# the .sav file ships with the app.
with open('encoded_quran_text_split_multilingual-e5-large-instructs.sav', 'rb') as file:
    document_embeddings = pickle.load(file)
print("load quran embedding")
print(datetime.fromtimestamp(time.time()))

def make_clickable_both(val): 
    """Split a 'label#url' string and render it as an HTML anchor tag."""
    label, link = val.split('#')
    # Debug output of the two halves before formatting.
    print(label+"\n")
    print(link+"\n")
    return f'<a href="{link}">{label}</a>'

def find(query):
    """Semantic search: return the 3 Quran verses most similar to *query*.

    Embeds the query with the module-level E5 model, scores it against the
    precomputed verse embeddings, and returns a DataFrame whose 'text'
    column holds HTML links to each verse's Ibn Kathir tafsir on quran.com.
    """
    print("start")
    print(datetime.fromtimestamp(time.time()))

    def get_detailed_instruct(task_description: str, query: str) -> str:
        # E5-instruct models expect the query prefixed with a task description.
        return f'Instruct: {task_description}\nQuery: {query}'

    # Each query must come with a one-sentence instruction that describes the task
    task = 'Given a web search query, retrieve relevant passages that answer the query'
    queries = [
        get_detailed_instruct(task, query)
    ]

    query_embeddings = model.encode(queries, convert_to_tensor=True, normalize_embeddings=True)
    print("embed query")
    print(datetime.fromtimestamp(time.time()))

    # Embeddings are normalized, so the dot product is cosine similarity;
    # scaled by 100 for readability.
    scores = (query_embeddings @ document_embeddings.T) * 100
    print("count similarities")
    print(datetime.fromtimestamp(time.time()))

    # Reload the splitted-verse frame on every call so the 'similarity'
    # column never accumulates across requests. Context manager closes the
    # file (the original leaked one handle per request).
    with open('quran-splitted.sav', 'rb') as f:
        quran_splitted = pickle.load(f)
    print("load quran")
    print(datetime.fromtimestamp(time.time()))

    quran_splitted['similarity'] = scores.tolist()[0]
    sorted_quran = quran_splitted.sort_values(by='similarity', ascending=False)
    print("sort by similarity")
    print(datetime.fromtimestamp(time.time()))

    # Collect the full English text of the top-3 matching verses by
    # (sura, aya) lookup in the translation frame.
    results = pd.DataFrame()
    for _, hit in sorted_quran.head(3).iterrows():
        match = quran.loc[(quran['sura'] == hit['sura']) & (quran['aya'] == hit['aya'])]
        results = pd.concat([results, match])
    print("collect results")
    print(datetime.fromtimestamp(time.time()))

    # Wrap each verse in a tafsir link and append its (sura:aya) reference.
    url = 'https://quran.com/'+results['sura'].astype(str)+':'+results['aya'].astype(str)+'/tafsirs/en-tafisr-ibn-kathir'
    results['text'] = '<a href="'+url+'">'+results['text']+ '</a>' + ' (QS. ' + results['sura'].astype(str) + ':' + results['aya'].astype(str) + ')'
    results = results.drop(columns=['sura', 'aya'])
    return results
    
# Gradio UI: a single textbox in, a one-column DataFrame of HTML verse links
# out. Example queries are cached lazily (embedded on first use, not at boot).
demo = gr.Interface(
    fn=find, 
    inputs="textbox", 
    #outputs=[gr.Dataframe(headers=['text'],datatype=["markdown"],wrap=True),gr.DownloadButton()],  
    outputs=[gr.Dataframe(headers=['text'],datatype=["markdown"],wrap=True)],  
    cache_examples="lazy",
    examples=[
                ["law of inheritance in islam"],
                ["tunjukilah jalan yang lurus"],
                ["سليمان"],
            ],
    title="Quran Finder")
#demo = gr.Interface(fn=find, inputs="textbox", outputs="textbox")
    
# Launch the web server only when run as a script (not when imported).
if __name__ == "__main__":
    demo.launch()