# yvanai / app.py
from langchain_community.llms import Ollama
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import create_retrieval_chain, SequentialChain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.question_answering import load_qa_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks import StdOutCallbackHandler
#from langchain_openai import ChatOpenAI
from transformers import pipeline, TextIteratorStreamer, AutoTokenizer, AutoModel, AutoModelForCausalLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login
from threading import Thread
from typing import Any, List
import ast, torch
from http import cookies
from langchain_chroma import Chroma
import json
from queue import Queue, Empty
from scapy.all import rdpcap
import chromadb
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import create_langchain_embedding
import gradio as gr
import random, time, pymupdf
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path
import requests
from chromadb.config import Settings
from chromadb.api.shared_system_client import SharedSystemClient
from tqdm import tqdm
from langchain_community.document_loaders import (
CSVLoader,
TextLoader,
UnstructuredPowerPointLoader,
PyMuPDFLoader,
UnstructuredWordDocumentLoader,
)
import os, sys, getopt, glob, shutil, string, platform, gc, re, asyncio
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import itertools
from pcap import pcapanalyze
############Initialize global variables ##############
chat_history = []
count = 0
nrThread = 100
q = Queue()
job_done = object()
#globalChain = None
PERSIST_DIR = 'db2/'
js = '''function js(){window.set_cookie = function(key, value){document.cookie = key+'='+value+'; Path=/; SameSite=Strict';return [value]}}'''
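# The js snippet above defines window.set_cookie in the browser; the event handlers below pass
# small "js=" callbacks that call it to persist the uploaded file path, original file name and
# conversation in cookies, which get_config() reads back on page load.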
CHROMA_SETTINGS = Settings(
persist_directory=PERSIST_DIR,
anonymized_telemetry=False,
)
model_name = 'BAAI/bge-large-en-v1.5'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
#embeddings = OllamaEmbeddings(model="mxbai-embed-large")
LOADER_MAPPING = {
".csv": (CSVLoader, {}),
".txt": (TextLoader, {"encoding": "utf8"}),
".ppt": (UnstructuredPowerPointLoader, {"mode":"single", "strategy": "high_res"}),
".pptx": (UnstructuredPowerPointLoader, {"mode":"single", "strategy": "high_res"}),
".doc": (UnstructuredWordDocumentLoader, {}),
".docx": (UnstructuredWordDocumentLoader, {}),
".pdf": (PyMuPDFLoader, {}),
".pdfo": (PyMuPDFLoader, {'extract_images':True}),
}
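# Note: ".pdfo" is not a real extension. When a PDF yields almost no extractable text,
# load_single_document() copies it to files/<name>.pdfo and reloads it with extract_images=True
# as an OCR-style fallback for scanned documents.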
system_prompt = (
"Use the given context to answer the question. "
"If the question is not related to the context, then use your general knowledge to answer the question. "
"If you don't know the answer, just say that you don't know, don't try to make up an answer. "
"Use three sentences maximum and keep the answer concise. "
"Your name is YVAN and you are an AI specializing information related to the C4ISR of the Philippine Navy. "
"Context: {context}"
)
user_prompt = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
("human", "{input}"),
]
)
##################for File Ingestion
def remove_ansi_escape_sequences(input_string):
# Define a regular expression pattern to match ANSI escape sequences
ansi_escape_pattern = r'\x1B(?:[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]'
# Use re.sub() to replace ANSI escape sequences with an empty string
cleaned_string = re.sub(ansi_escape_pattern, '', input_string)
cleaned_string = cleaned_string.encode("ascii", errors='ignore')
cleaned_string = cleaned_string.decode('utf-8', errors='ignore')
return cleaned_string
def load_single_document(file_path):
ext = "." + file_path.rsplit(".", 1)[-1]
#print(ext)
if ext in LOADER_MAPPING or ext == '.pcap':
if (ext == '.pcap'): #check if pcap file
pcapana=pcapanalyze(file_path)
data, http_data = pcapana.parse_pcap()
myRet = pcapana.prepare_documents(data, http_data)
with open('files/'+os.path.basename(file_path) + '.txt' ,mode = 'w') as file:
for i, doc in enumerate(myRet):
file.write(str(doc) + '\n')
else:
loader_class, loader_args = LOADER_MAPPING[ext]
loader = loader_class(file_path, **loader_args)
myRet = loader.load()
if (ext=='.pdf'): #check OCR in all PDF
disStr = ''
for doc in myRet:
disStr += doc.page_content
if (len(disStr)< 100): #if not text possible OCR
shutil.copyfile(file_path, 'files/'+os.path.basename(file_path)+'o')
loader_class, loader_args = LOADER_MAPPING[ext+'o']
loader = loader_class('files/'+os.path.basename(file_path)+'o', **loader_args)
myRet = loader.load()
#print(myRet)
return myRet
raise ValueError(f"Unsupported file extension '{ext}'")
def split_docs(docs, size):
for i in range(0, len(docs), size):
yield docs[i:i + size]
def runembed(dbread, docs, i):
#print(i)
#print(docs)
dbread.add_documents([docs])
def remove_ws(d):
text = d.page_content.strip()
text = text.encode("ascii", errors='ignore')
text = text.decode('utf-8', errors='ignore')
d.page_content = text
    d.metadata = {}  # drop metadata (LangChain Documents expect a dict here)
return d
def filtercollname(collname):
    # Strip punctuation and spaces so the file name can be used as a Chroma collection name
    for ch in "-()._ ":
        collname = collname.replace(ch, '')
    return collname
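# Chroma restricts the characters allowed in collection names, so punctuation and spaces are
# stripped from the file name. The same filtered name is used at ingestion time (iterateDocs)
# and at query time (embedDocs), so the retriever attaches to the collection built for the file.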
def processDocs(file):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = load_single_document(file)
docs = text_splitter.split_documents(documents)
docs = [remove_ws(d) for d in docs]
totLen = len(docs)
#splitsize = 1 #len(docs)
#docs = split_docs(docs, splitsize)
return docs, totLen
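# iterateDocs() embeds a document chunk by chunk: the first chunk creates the Chroma collection
# synchronously, and every later chunk is added from a worker thread via runembed(). Threads are
# joined in batches of numThr so the progress bar advances and the number of concurrent
# embedding calls stays bounded.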
def iterateDocs(file, progress, numThr):
arrProc = []
countthread = 0
disLen = 0
docs_chunked, totLen = processDocs(file)
#print(numThr)
collname = os.path.basename(file)
collname = filtercollname(collname)
#print(collname)
dbread = Chroma(persist_directory=PERSIST_DIR)
    dbread._client.get_or_create_collection(name=collname)
#dbread._client.delete_collection(name=collname)
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:")
for i, doc in enumerate(docs_chunked):
if (i==0):
#print(doc)
            dbread = Chroma.from_documents(collection_name=collname, documents=[doc], embedding=embeddings, persist_directory=PERSIST_DIR)
progress((i+1)/totLen)
else:
proc = Thread(target = runembed, args=(dbread, doc, i,))
arrProc.append(proc)
proc.start()
countthread +=1
if (countthread >= numThr):
for z, proc in enumerate(arrProc):
proc.join()
progress((i+1)/totLen)
arrProc = []
countthread = 0
disLen = i
for i, proc in enumerate(arrProc):
proc.join()
progress((disLen+1)/totLen)
disLen += 1
return docs_chunked
###################End file ingestion
###############################For AI Query
class QueueCallback(BaseCallbackHandler):
"""Callback handler for streaming LLM responses to a queue."""
def __init__(self, q):
self.q = q
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
self.q.put(token)
def on_llm_end(self, *args, **kwargs: Any) -> None:
return self.q.empty()
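# Streaming: QueueCallback pushes every token generated by the LLM into the shared queue q;
# bots() drains that queue to grow the chat reply incrementally in the UI.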
def save_string_to_image(text, filename, font_path=None, font_size=40):
# Determine the size of the image
lines = text.split('\n')
max_width = max(len(line) for line in lines)
width = max_width * font_size // 20
height = len(lines) * font_size * 2
image = Image.new('RGB', (width, height), color=(255, 255, 255))
draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
y_text = 10
for line in lines:
draw.text((2, y_text), line, font=font, fill=(0, 0, 0))
y_text += font_size * 1.5 # Move down by font size for the next line
image.save(filename)
def answer(question, chain):
def task():
response = chain.invoke({"input": question})
#response = chain.invoke({'input_documents':[doc], 'input':question})
q.put(job_done)
t = Thread(target=task)
t.start()
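# answer() invokes the chain in a background thread so the Gradio generator in bots() is never
# blocked; the job_done sentinel is enqueued once the chain finishes, telling bots() to stop
# reading tokens and persist the completed reply.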
def embedDocs(myPERSIST_DIR, collname ):
collname = filtercollname(collname)
#print(collname)
global dbread, retriever
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:")
    dbread = Chroma(collection_name=collname, embedding_function=embeddings, persist_directory=myPERSIST_DIR)
retriever = dbread.as_retriever(search_kwargs={"k": 4})
return retriever, dbread
def analyzeAI(modelname):
callbacks = [QueueCallback(q)]
llm = Ollama(model=modelname, callbacks=callbacks)
return llm
def prompt(llm, retriever):
question_answer_chain = create_stuff_documents_chain(llm, user_prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)
return chain
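# Standard LangChain retrieval pipeline: create_stuff_documents_chain stuffs the retrieved
# chunks into the {context} slot of user_prompt, and create_retrieval_chain puts the retriever
# in front of it, so chain.invoke({"input": ...}) fetches the top-k chunks and then queries the LLM.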
##################end AI query
################gradio functions
def user(user_message, history, convo):
convo.append(['user',user_message])
return "", history + [[user_message, None]], convo
def bots(history, convo, anser):
# bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
# time.sleep(2)
# history[-1][1] = ""
# for character in bot_message:
# print(character)
# history[-1][1] += character
# time.sleep(0.05)
# yield character
#print(history)
question = history[-1][0]
history[-1][1] = ""
try:
answer(question, globalChain)
    except Exception:
yield [[question,'AI not responding. Please reload.']], convo, "Please reload", convo
return
while True:
try:
next_token = q.get(True, timeout=1)
if next_token is job_done:
convo.append(['assistant', anser])
yield history, convo, anser, convo
break
history[-1][1] += next_token
anser += next_token
yield history, convo, anser, convo
except Empty:
continue
def purge_chat_and_render_first(file, chatbot, numThr, myFiles, origFile, progress=gr.Progress()):
progress(0)
myFiles = file.name
origFile = os.path.basename(file.name)
#Clear persistent DB
#system = platform.system()
#if system == "Windows":
#dbread.reset()
#dbread._system.stop()
#gc.collect()
#try:
# os.remove('db2/'+os.path.basename(myFiles))
# print("chroma.sqlite file deleted successfully.")
#except OSError as e:
# print(f"Error deleting chroma.sqlite: {e}")
#load image
image = None
docs_chunked = iterateDocs(file, progress, numThr)
ext = "." + os.path.basename(file).rsplit(".", 1)[-1]
if ext != '.pdf':
myFiles = Path(os.getcwd()+'/files/' + os.path.basename(file.name) +'.pdf')
if os.path.exists('files/'+os.path.basename(file.name) + '.txt'):
with open('files/'+os.path.basename(file.name) + '.txt' ,mode = 'r') as file:
disStr = file.read(2000)
save_string_to_image(disStr, myFiles, font_size=30)
else:
#print(myDocs)
save_string_to_image(str(docs_chunked[0].page_content), myFiles, font_size=30)
doc = pymupdf.open(Path(myFiles))
else: doc = pymupdf.open(Path(file.name))
#print(myFiles)
page = doc[0]
pix = page.get_pixmap(dpi=150)
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
return chatbot, numThr, myFiles, image, origFile
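# Preview generation: non-PDF uploads are rendered to a one-page document under files/ (the pcap
# summary text, or the first chunk, drawn with save_string_to_image), and then, like native PDFs,
# the first page is rasterised with pymupdf and shown in the image pane.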
css = """
.gradio-container {background-color:#BBD3FB}
#rowTitle {text-align: center;}
#cssTitle {text-align: center; color: #FFFFFF; background:#3B69B7; font-size: 30px; padding-top: 4px; padding-bottom: 4px;}
#cssBut {width:100px; height: 60px;}
#cssText {width:100px;}
"""
#######################end gradio functions
def get_config(chatbot, convo, request: gr.Request):
config = {'filepath':'none','convo':[], 'origFile':'none'}
for key in config:
if key in request.cookies:
config[key] = request.cookies[key]
#print(config['newpath'])
image = None
myFile = config['filepath']
origFile = config['origFile']
if config['filepath'] is not None:
if (config['filepath']!='none' and config['filepath']!='undefined'):
if os.path.exists(config['filepath']):
#print(config['filepath'])
doc = pymupdf.open(config['filepath'])
page = doc[0]
pix = page.get_pixmap(dpi=150)
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
chatbot= []
if ((config['convo'] != []) and (config['convo'] is not None) and (config['convo']!='') and (config['convo']!='undefined')):
#listconvo = config['convo'].split(',')
#listconvo = json.dumps(config['convo'], separators=(',', ':'))
#listconvo = json.loads(config['convo'])
listconvo = ast.literal_eval(config['convo'])
convo = listconvo
for conv in listconvo:
#print(conv)
if conv[0] == 'user': chatbot = chatbot + [[str(conv[1]), None]]
elif conv[0] == 'assistant': chatbot = chatbot + [[None, str(conv[1])]]
return image, myFile, chatbot, convo, origFile
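# get_config() restores a session from cookies on page load: 'filepath' is the last preview file
# (re-rendered with pymupdf), 'origFile' is the original upload name used to select the Chroma
# collection, and 'convo' is the serialized chat history replayed into the Chatbot component.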
def updateDIR(myFile):
disFile = os.path.basename(myFile)
#ext = "." + disFile.rsplit(".", 1)[-1]
#if ext == '.pdf':
# disFile = disFile[0:len(disFile)-4:1]
retriever, dbread = embedDocs('db2/', disFile)
#print('here')
llm = analyzeAI('llava-llama3')
global globalChain
globalChain = prompt (llm, retriever)
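# updateDIR() rebuilds the retrieval chain for the restored file and stores it in the module-level
# globalChain consumed by bots(). It assumes a local Ollama server is reachable and that the
# llava-llama3 model has already been pulled (e.g. with `ollama pull llava-llama3`).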
def updateConvo(history):
return history
def processChain(myFiles, origFile):
return myFiles, origFile
def clearbot():
return [],[]
with gr.Blocks(css=css, fill_width=True) as demo:
#PERSIST_DIR = 'db2/'+''.join(random.choices(string.ascii_uppercase + string.digits, k=7))+'/'
myFile = gr.State("none")
origFile = gr.State("none")
numThr = gr.State(100)
convo = gr.State([])
anser = gr.State('')
myDoc = gr.State([])
with gr.Row(elem_id="rowTitle"):
gr.Button(value="Welcome to PN Cyber AI (codename: YVAN)", elem_id="cssTitle")
with gr.Row():
with gr.Column(scale=8):
with gr.Row():
chatbot = gr.Chatbot()
#chatbot.change(updateConvo,[chatbot], js="(convo) => {set_cookie('convo', convo)}")
with gr.Row():
with gr.Column(scale=4, min_width=300):
msg = gr.Textbox(show_label=False,placeholder = "Ask me")
with gr.Column(scale=1, min_width=50):
send = gr.Button("Send", elem_id="cssBut")
with gr.Column(scale=1, min_width=50):
clear = gr.Button("Clear", elem_id="cssBut")
clear.click(clearbot, [], [chatbot, convo], queue=False, js="(convbox) => {set_cookie('convo', '')}")
with gr.Column(scale=4):
with gr.Row():
show_img = gr.Image(label="Upload Logs/pcap/Doc", sources=[("upload")], type="filepath")
with gr.Row():
btn = gr.UploadButton("📁 Upload Logs/pcap/Doc", file_types=[".pdf", ".csv", ".pcap", ".txt", ".docx", ".pptx"])
with gr.Row():
slider = gr.Slider(50, 1000, value=100, interactive=True, step=50,label="Thread", info="Choose Number of Threads to multitask")
with gr.Row(visible=False):
convobox = gr.Textbox()
txtFile = gr.Textbox()
origFile = gr.Textbox()
demo.load(fn=get_config, inputs=[chatbot, convo], outputs=[show_img, txtFile, chatbot, convo, origFile], js=js
).then(updateDIR,[origFile])
msg.submit(user, [msg, chatbot, convo], [msg, chatbot, convo], queue=False
).then(bots, [chatbot, convo, anser], [chatbot, convo, anser, convobox]
).then(updateConvo,[convobox], js="(convbox) => {set_cookie('convo', convbox)}")
send.click(user, [msg, chatbot, convo], [msg, chatbot, convo ], queue=False
).then(bots, [chatbot, convo, anser], [chatbot, convo, anser, convobox]
).then(updateConvo,[convobox], js="(convbox) => {set_cookie('convo', convbox)}")
btn.upload(
fn=purge_chat_and_render_first,
inputs=[btn, chatbot, numThr, myFile, origFile],
outputs=[chatbot, numThr, txtFile, show_img, origFile],
#js="(btn, chatbot, numThr, myFile) => {set_cookie('filepath', myFile);}",
).then(processChain,[txtFile, origFile], js="(mFile, origFile) => {set_cookie('filepath', mFile); set_cookie('origFile', origFile);}")
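    # Upload flow: purge_chat_and_render_first ingests the file into Chroma and returns the preview
    # image; the js callback on processChain then stores the preview path and original file name in
    # cookies so the session (and the matching collection) can be restored after a page reload.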
if __name__ == '__main__':
path = 'files'
if not os.path.exists(path):
os.makedirs(path)
path = 'db2'
if not os.path.exists(path):
os.makedirs(path)
demo.queue()
demo.launch()