# yvanai / app.py
from langchain_community.llms import Ollama
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import create_retrieval_chain, SequentialChain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.question_answering import load_qa_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks import StdOutCallbackHandler
#from langchain_openai import ChatOpenAI
from transformers import pipeline, TextIteratorStreamer, AutoTokenizer, AutoModel, AutoModelForCausalLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login
from threading import Thread
from typing import Any, List
import ast, torch
from http import cookies
from langchain_chroma import Chroma
import json
from queue import Queue, Empty
from scapy.all import rdpcap
import chromadb
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import create_langchain_embedding
import gradio as gr
import random, time, pymupdf
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path
import requests
from chromadb.config import Settings
from chromadb.api.shared_system_client import SharedSystemClient
from tqdm import tqdm
from langchain_community.document_loaders import (
CSVLoader,
TextLoader,
UnstructuredPowerPointLoader,
PyMuPDFLoader,
UnstructuredWordDocumentLoader,
)
import os, sys, getopt, glob, shutil, string, platform, gc, re, asyncio
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import itertools
from pcap import pcapanalyze
############Initialize global variables ##############
chat_history = []
count = 0
nrThread = 100
q = Queue()
job_done = object()
#globalChain = None
PERSIST_DIR = 'db2/'
js = '''function js(){window.set_cookie = function(key, value){document.cookie = key+'='+value+'; Path=/; SameSite=Strict';return [value]}}'''
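# The js snippet above defines window.set_cookie in the browser; the event handlers below pass
# small "js=" callbacks that call it to persist the uploaded file path, original file name and
# conversation in cookies, which get_config() reads back on page load.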
CHROMA_SETTINGS = Settings(
persist_directory=PERSIST_DIR,
anonymized_telemetry=False,
)
model_name = 'BAAI/bge-large-en-v1.5'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True} # set True to compute cosine similarity
#embeddings = OllamaEmbeddings(model="mxbai-embed-large")
LOADER_MAPPING = {
".csv": (CSVLoader, {}),
".txt": (TextLoader, {"encoding": "utf8"}),
".ppt": (UnstructuredPowerPointLoader, {"mode":"single", "strategy": "high_res"}),
".pptx": (UnstructuredPowerPointLoader, {"mode":"single", "strategy": "high_res"}),
".doc": (UnstructuredWordDocumentLoader, {}),
".docx": (UnstructuredWordDocumentLoader, {}),
".pdf": (PyMuPDFLoader, {}),
".pdfo": (PyMuPDFLoader, {'extract_images':True}),
}
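# Note: ".pdfo" is not a real extension. When a PDF yields almost no extractable text,
# load_single_document() copies it to files/<name>.pdfo and reloads it with extract_images=True
# as an OCR-style fallback for scanned documents.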
system_prompt = (
"Use the given context to answer the question. "
"If the question is not related to the context, then use your general knowledge to answer the question. "
"If you don't know the answer, just say that you don't know, don't try to make up an answer. "
"Use three sentences maximum and keep the answer concise. "
"Your name is YVAN and you are an AI specializing information related to the C4ISR of the Philippine Navy. "
"Context: {context}"
)
user_prompt = ChatPromptTemplate.from_messages(
[
("system", system_prompt),
("human", "{input}"),
]
)
##################for File Ingestion
def remove_ansi_escape_sequences(input_string):
# Define a regular expression pattern to match ANSI escape sequences
ansi_escape_pattern = r'\x1B(?:[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]'
# Use re.sub() to replace ANSI escape sequences with an empty string
cleaned_string = re.sub(ansi_escape_pattern, '', input_string)
cleaned_string = cleaned_string.encode("ascii", errors='ignore')
cleaned_string = cleaned_string.decode('utf-8', errors='ignore')
return cleaned_string
def load_single_document(file_path):
ext = "." + file_path.rsplit(".", 1)[-1]
#print(ext)
if ext in LOADER_MAPPING or ext == '.pcap':
if (ext == '.pcap'): #check if pcap file
pcapana=pcapanalyze(file_path)
data, http_data = pcapana.parse_pcap()
myRet = pcapana.prepare_documents(data, http_data)
with open('files/'+os.path.basename(file_path) + '.txt' ,mode = 'w') as file:
for i, doc in enumerate(myRet):
file.write(str(doc) + '\n')
else:
loader_class, loader_args = LOADER_MAPPING[ext]
loader = loader_class(file_path, **loader_args)
myRet = loader.load()
if (ext=='.pdf'): #check OCR in all PDF
disStr = ''
for doc in myRet:
disStr += doc.page_content
if (len(disStr)< 100): #if not text possible OCR
shutil.copyfile(file_path, 'files/'+os.path.basename(file_path)+'o')
loader_class, loader_args = LOADER_MAPPING[ext+'o']
loader = loader_class('files/'+os.path.basename(file_path)+'o', **loader_args)
myRet = loader.load()
#print(myRet)
return myRet
raise ValueError(f"Unsupported file extension '{ext}'")
def split_docs(docs, size):
for i in range(0, len(docs), size):
yield docs[i:i + size]
def runembed(dbread, docs, i):
#print(i)
#print(docs)
dbread.add_documents([docs])
def remove_ws(d):
text = d.page_content.strip()
text = text.encode("ascii", errors='ignore')
text = text.decode('utf-8', errors='ignore')
d.page_content = text
    d.metadata = {}  # drop metadata (LangChain Documents expect a dict here)
return d
def filtercollname(collname):
    # Strip punctuation and spaces so the file name can be used as a Chroma collection name
    for ch in "-()._ ":
        collname = collname.replace(ch, '')
    return collname
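# Chroma restricts the characters allowed in collection names, so punctuation and spaces are
# stripped from the file name. The same filtered name is used at ingestion time (iterateDocs)
# and at query time (embedDocs), so the retriever attaches to the collection built for the file.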
def processDocs(file):
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
documents = load_single_document(file)
docs = text_splitter.split_documents(documents)
docs = [remove_ws(d) for d in docs]
totLen = len(docs)
#splitsize = 1 #len(docs)
#docs = split_docs(docs, splitsize)
return docs, totLen
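# iterateDocs() embeds a document chunk by chunk: the first chunk creates the Chroma collection
# synchronously, and every later chunk is added from a worker thread via runembed(). Threads are
# joined in batches of numThr so the progress bar advances and the number of concurrent
# embedding calls stays bounded.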
def iterateDocs(file, progress, numThr):
arrProc = []
countthread = 0
disLen = 0
docs_chunked, totLen = processDocs(file)
#print(numThr)
collname = os.path.basename(file)
collname = filtercollname(collname)
#print(collname)
dbread = Chroma(persist_directory=PERSIST_DIR)
    dbread._client.get_or_create_collection(name=collname)
#dbread._client.delete_collection(name=collname)
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:")
for i, doc in enumerate(docs_chunked):
if (i==0):
#print(doc)
            dbread = Chroma.from_documents(collection_name=collname, documents=[doc], embedding=embeddings, persist_directory=PERSIST_DIR)
progress((i+1)/totLen)
else:
proc = Thread(target = runembed, args=(dbread, doc, i,))
arrProc.append(proc)
proc.start()
countthread +=1
if (countthread >= numThr):
for z, proc in enumerate(arrProc):
proc.join()
progress((i+1)/totLen)
arrProc = []
countthread = 0
disLen = i
for i, proc in enumerate(arrProc):
proc.join()
progress((disLen+1)/totLen)
disLen += 1
return docs_chunked
###################End file ingestion
###############################For AI Query
class QueueCallback(BaseCallbackHandler):
"""Callback handler for streaming LLM responses to a queue."""
def __init__(self, q):
self.q = q
def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
self.q.put(token)
def on_llm_end(self, *args, **kwargs: Any) -> None:
return self.q.empty()
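# Streaming: QueueCallback pushes every token generated by the LLM into the shared queue q;
# bots() drains that queue to grow the chat reply incrementally in the UI.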
def save_string_to_image(text, filename, font_path=None, font_size=40):
# Determine the size of the image
lines = text.split('\n')
max_width = max(len(line) for line in lines)
width = max_width * font_size // 20
height = len(lines) * font_size * 2
image = Image.new('RGB', (width, height), color=(255, 255, 255))
draw = ImageDraw.Draw(image)
font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
y_text = 10
for line in lines:
draw.text((2, y_text), line, font=font, fill=(0, 0, 0))
y_text += font_size * 1.5 # Move down by font size for the next line
image.save(filename)
def answer(question, chain):
def task():
response = chain.invoke({"input": question})
#response = chain.invoke({'input_documents':[doc], 'input':question})
q.put(job_done)
t = Thread(target=task)
t.start()
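# answer() invokes the chain in a background thread so the Gradio generator in bots() is never
# blocked; the job_done sentinel is enqueued once the chain finishes, telling bots() to stop
# reading tokens and persist the completed reply.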
def embedDocs(myPERSIST_DIR, collname ):
collname = filtercollname(collname)
#print(collname)
global dbread, retriever
embeddings = HuggingFaceBgeEmbeddings(model_name=model_name,model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:")
    dbread = Chroma(collection_name=collname, embedding_function=embeddings, persist_directory=myPERSIST_DIR)
retriever = dbread.as_retriever(search_kwargs={"k": 4})
return retriever, dbread
def analyzeAI(modelname):
callbacks = [QueueCallback(q)]
llm = Ollama(model=modelname, callbacks=callbacks)
return llm
def prompt(llm, retriever):
question_answer_chain = create_stuff_documents_chain(llm, user_prompt)
chain = create_retrieval_chain(retriever, question_answer_chain)
return chain
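# Standard LangChain retrieval pipeline: create_stuff_documents_chain stuffs the retrieved
# chunks into the {context} slot of user_prompt, and create_retrieval_chain puts the retriever
# in front of it, so chain.invoke({"input": ...}) fetches the top-k chunks and then queries the LLM.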
##################end AI query
################gradio functions
def user(user_message, history, convo):
convo.append(['user',user_message])
return "", history + [[user_message, None]], convo
def bots(history, convo, anser):
# bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
# time.sleep(2)
# history[-1][1] = ""
# for character in bot_message:
# print(character)
# history[-1][1] += character
# time.sleep(0.05)
# yield character
#print(history)
question = history[-1][0]
history[-1][1] = ""
try:
answer(question, globalChain)
    except Exception:
yield [[question,'AI not responding. Please reload.']], convo, "Please reload", convo
return
while True:
try:
next_token = q.get(True, timeout=1)
if next_token is job_done:
convo.append(['assistant', anser])
yield history, convo, anser, convo
break
history[-1][1] += next_token
anser += next_token
yield history, convo, anser, convo
except Empty:
continue
def purge_chat_and_render_first(file, chatbot, numThr, myFiles, origFile, progress=gr.Progress()):
progress(0)
myFiles = file.name
origFile = os.path.basename(file.name)
#Clear persistent DB
#system = platform.system()
#if system == "Windows":
#dbread.reset()
#dbread._system.stop()
#gc.collect()
#try:
# os.remove('db2/'+os.path.basename(myFiles))
# print("chroma.sqlite file deleted successfully.")
#except OSError as e:
# print(f"Error deleting chroma.sqlite: {e}")
#load image
image = None
docs_chunked = iterateDocs(file, progress, numThr)
ext = "." + os.path.basename(file).rsplit(".", 1)[-1]
if ext != '.pdf':
myFiles = Path(os.getcwd()+'/files/' + os.path.basename(file.name) +'.pdf')
if os.path.exists('files/'+os.path.basename(file.name) + '.txt'):
with open('files/'+os.path.basename(file.name) + '.txt' ,mode = 'r') as file:
disStr = file.read(2000)
save_string_to_image(disStr, myFiles, font_size=30)
else:
#print(myDocs)
save_string_to_image(str(docs_chunked[0].page_content), myFiles, font_size=30)
doc = pymupdf.open(Path(myFiles))
else: doc = pymupdf.open(Path(file.name))
#print(myFiles)
page = doc[0]
pix = page.get_pixmap(dpi=150)
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
return chatbot, numThr, myFiles, image, origFile
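# Preview generation: non-PDF uploads are rendered to a one-page document under files/ (the pcap
# summary text, or the first chunk, drawn with save_string_to_image), and then, like native PDFs,
# the first page is rasterised with pymupdf and shown in the image pane.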
css = """
.gradio-container {background-color:#BBD3FB}
#rowTitle {text-align: center;}
#cssTitle {text-align: center; color: #FFFFFF; background:#3B69B7; font-size: 30px; padding-top: 4px; padding-bottom: 4px;}
#cssBut {width:100px; height: 60px;}
#cssText {width:100px;}
"""
#######################end gradio functions
def get_config(chatbot, convo, request: gr.Request):
config = {'filepath':'none','convo':[], 'origFile':'none'}
for key in config:
if key in request.cookies:
config[key] = request.cookies[key]
#print(config['newpath'])
image = None
myFile = config['filepath']
origFile = config['origFile']
if config['filepath'] is not None:
if (config['filepath']!='none' and config['filepath']!='undefined'):
if os.path.exists(config['filepath']):
#print(config['filepath'])
doc = pymupdf.open(config['filepath'])
page = doc[0]
pix = page.get_pixmap(dpi=150)
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
chatbot= []
if ((config['convo'] != []) and (config['convo'] is not None) and (config['convo']!='') and (config['convo']!='undefined')):
#listconvo = config['convo'].split(',')
#listconvo = json.dumps(config['convo'], separators=(',', ':'))
#listconvo = json.loads(config['convo'])
listconvo = ast.literal_eval(config['convo'])
convo = listconvo
for conv in listconvo:
#print(conv)
if conv[0] == 'user': chatbot = chatbot + [[str(conv[1]), None]]
elif conv[0] == 'assistant': chatbot = chatbot + [[None, str(conv[1])]]
return image, myFile, chatbot, convo, origFile
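# get_config() restores a session from cookies on page load: 'filepath' is the last preview file
# (re-rendered with pymupdf), 'origFile' is the original upload name used to select the Chroma
# collection, and 'convo' is the serialized chat history replayed into the Chatbot component.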
def updateDIR(myFile):
disFile = os.path.basename(myFile)
#ext = "." + disFile.rsplit(".", 1)[-1]
#if ext == '.pdf':
# disFile = disFile[0:len(disFile)-4:1]
retriever, dbread = embedDocs('db2/', disFile)
#print('here')
llm = analyzeAI('llava-llama3')
global globalChain
globalChain = prompt (llm, retriever)
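# updateDIR() rebuilds the retrieval chain for the restored file and stores it in the module-level
# globalChain consumed by bots(). It assumes a local Ollama server is reachable and that the
# llava-llama3 model has already been pulled (e.g. with `ollama pull llava-llama3`).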
def updateConvo(history):
return history
def processChain(myFiles, origFile):
return myFiles, origFile
def clearbot():
return [],[]
with gr.Blocks(css=css, fill_width=True) as demo:
#PERSIST_DIR = 'db2/'+''.join(random.choices(string.ascii_uppercase + string.digits, k=7))+'/'
myFile = gr.State("none")
origFile = gr.State("none")
numThr = gr.State(100)
convo = gr.State([])
anser = gr.State('')
myDoc = gr.State([])
with gr.Row(elem_id="rowTitle"):
gr.Button(value="Welcome to PN Cyber AI (codename: YVAN)", elem_id="cssTitle")
with gr.Row():
with gr.Column(scale=8):
with gr.Row():
chatbot = gr.Chatbot()
#chatbot.change(updateConvo,[chatbot], js="(convo) => {set_cookie('convo', convo)}")
with gr.Row():
with gr.Column(scale=4, min_width=300):
msg = gr.Textbox(show_label=False,placeholder = "Ask me")
with gr.Column(scale=1, min_width=50):
send = gr.Button("Send", elem_id="cssBut")
with gr.Column(scale=1, min_width=50):
clear = gr.Button("Clear", elem_id="cssBut")
clear.click(clearbot, [], [chatbot, convo], queue=False, js="(convbox) => {set_cookie('convo', '')}")
with gr.Column(scale=4):
with gr.Row():
show_img = gr.Image(label="Upload Logs/pcap/Doc", sources=[("upload")], type="filepath")
with gr.Row():
btn = gr.UploadButton("📁 Upload Logs/pcap/Doc", file_types=[".pdf", ".csv", ".pcap", ".txt", ".docx", ".pptx"])
with gr.Row():
slider = gr.Slider(50, 1000, value=100, interactive=True, step=50,label="Thread", info="Choose Number of Threads to multitask")
with gr.Row(visible=False):
convobox = gr.Textbox()
txtFile = gr.Textbox()
origFile = gr.Textbox()
demo.load(fn=get_config, inputs=[chatbot, convo], outputs=[show_img, txtFile, chatbot, convo, origFile], js=js
).then(updateDIR,[origFile])
msg.submit(user, [msg, chatbot, convo], [msg, chatbot, convo], queue=False
).then(bots, [chatbot, convo, anser], [chatbot, convo, anser, convobox]
).then(updateConvo,[convobox], js="(convbox) => {set_cookie('convo', convbox)}")
send.click(user, [msg, chatbot, convo], [msg, chatbot, convo ], queue=False
).then(bots, [chatbot, convo, anser], [chatbot, convo, anser, convobox]
).then(updateConvo,[convobox], js="(convbox) => {set_cookie('convo', convbox)}")
btn.upload(
fn=purge_chat_and_render_first,
inputs=[btn, chatbot, numThr, myFile, origFile],
outputs=[chatbot, numThr, txtFile, show_img, origFile],
#js="(btn, chatbot, numThr, myFile) => {set_cookie('filepath', myFile);}",
).then(processChain,[txtFile, origFile], js="(mFile, origFile) => {set_cookie('filepath', mFile); set_cookie('origFile', origFile);}")
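    # Upload flow: purge_chat_and_render_first ingests the file into Chroma and returns the preview
    # image; the js callback on processChain then stores the preview path and original file name in
    # cookies so the session (and the matching collection) can be restored after a page reload.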
if __name__ == '__main__':
path = 'files'
if not os.path.exists(path):
os.makedirs(path)
path = 'db2'
if not os.path.exists(path):
os.makedirs(path)
demo.queue()
demo.launch()