from langchain_community.llms import Ollama
from langchain_huggingface import HuggingFacePipeline
from langchain.chains import RetrievalQA
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.chains import create_retrieval_chain, SequentialChain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.question_answering import load_qa_chain
from langchain_core.prompts import ChatPromptTemplate
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.callbacks.base import BaseCallbackHandler
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks import StdOutCallbackHandler
#from langchain_openai import ChatOpenAI
from transformers import pipeline, TextIteratorStreamer, AutoTokenizer, AutoModel, AutoModelForCausalLM
from langchain_text_splitters import RecursiveCharacterTextSplitter
from huggingface_hub import login
from threading import Thread
from typing import Any, List
import ast, torch
from http import cookies
from langchain_chroma import Chroma
import json
from queue import Queue, Empty
from scapy.all import rdpcap
import chromadb
from chromadb.utils import embedding_functions
from chromadb.utils.embedding_functions import create_langchain_embedding
import gradio as gr
import random, time, pymupdf
from PIL import Image, ImageDraw, ImageFont
from pathlib import Path
import requests
from chromadb.config import Settings
from chromadb.api.shared_system_client import SharedSystemClient
from tqdm import tqdm
from langchain_community.document_loaders import (
    CSVLoader,
    TextLoader,
    UnstructuredPowerPointLoader,
    PyMuPDFLoader,
    UnstructuredWordDocumentLoader,
)
import os, sys, getopt, glob, shutil, string, platform, gc, re, asyncio
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import itertools
from pcap import pcapanalyze
############Initialize global variables ##############
chat_history = []
count = 0
nrThread = 100
q = Queue()
job_done = object()
#globalChain = None
PERSIST_DIR = 'db2/'
js = '''function js(){window.set_cookie = function(key, value){document.cookie = key+'='+value+'; Path=/; SameSite=Strict';return [value]}}'''
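# Note: the js() snippet above registers window.set_cookie in the browser (it is passed
# as js=js to demo.load below); the js="..." hooks on msg.submit, send.click, clear.click
# and btn.upload call it so the conversation and uploaded-file path survive a page reload.
# get_config() reads those cookies back on the next load.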
CHROMA_SETTINGS = Settings(
    persist_directory=PERSIST_DIR,
    anonymized_telemetry=False,
)
model_name = 'BAAI/bge-large-en-v1.5'
model_kwargs = {'device': 'cpu'}
encode_kwargs = {'normalize_embeddings': True}  # set True to compute cosine similarity
#embeddings = OllamaEmbeddings(model="mxbai-embed-large")
LOADER_MAPPING = {
    ".csv": (CSVLoader, {}),
    ".txt": (TextLoader, {"encoding": "utf8"}),
    ".ppt": (UnstructuredPowerPointLoader, {"mode": "single", "strategy": "high_res"}),
    ".pptx": (UnstructuredPowerPointLoader, {"mode": "single", "strategy": "high_res"}),
    ".doc": (UnstructuredWordDocumentLoader, {}),
    ".docx": (UnstructuredWordDocumentLoader, {}),
    ".pdf": (PyMuPDFLoader, {}),
    ".pdfo": (PyMuPDFLoader, {'extract_images': True}),
}
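# '.pdfo' is not a real file extension: load_single_document() copies a PDF to a
# '<name>.pdfo' file and re-loads it with extract_images=True when the first pass
# returns almost no text (i.e. a scanned, image-only PDF).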
system_prompt = (
    "Use the given context to answer the question. "
    "If the question is not related to the context, then use your general knowledge to answer the question. "
    "If you don't know the answer, just say that you don't know; don't try to make up an answer. "
    "Use three sentences maximum and keep the answer concise. "
    "Your name is YVAN and you are an AI specializing in information related to the C4ISR of the Philippine Navy. "
    "Context: {context}"
)
user_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", system_prompt),
        ("human", "{input}"),
    ]
)
##################for File Ingestion
def remove_ansi_escape_sequences(input_string):
    # Define a regular expression pattern to match ANSI escape sequences
    ansi_escape_pattern = r'\x1B(?:[@-_]|[\x80-\x9F])[0-?]*[ -/]*[@-~]'
    # Use re.sub() to replace ANSI escape sequences with an empty string
    cleaned_string = re.sub(ansi_escape_pattern, '', input_string)
    cleaned_string = cleaned_string.encode("ascii", errors='ignore')
    cleaned_string = cleaned_string.decode('utf-8', errors='ignore')
    return cleaned_string
def load_single_document(file_path):
    ext = "." + file_path.rsplit(".", 1)[-1]
    #print(ext)
    if ext in LOADER_MAPPING or ext == '.pcap':
        if ext == '.pcap':  # pcap files go through the pcap analyzer instead of a document loader
            pcapana = pcapanalyze(file_path)
            data, http_data = pcapana.parse_pcap()
            myRet = pcapana.prepare_documents(data, http_data)
            with open('files/' + os.path.basename(file_path) + '.txt', mode='w') as file:
                for i, doc in enumerate(myRet):
                    file.write(str(doc) + '\n')
        else:
            loader_class, loader_args = LOADER_MAPPING[ext]
            loader = loader_class(file_path, **loader_args)
            myRet = loader.load()
            if ext == '.pdf':  # check whether the PDF actually contains text
                disStr = ''
                for doc in myRet:
                    disStr += doc.page_content
                if len(disStr) < 100:  # almost no text: likely a scanned PDF, reload with image extraction
                    shutil.copyfile(file_path, 'files/' + os.path.basename(file_path) + 'o')
                    loader_class, loader_args = LOADER_MAPPING[ext + 'o']
                    loader = loader_class('files/' + os.path.basename(file_path) + 'o', **loader_args)
                    myRet = loader.load()
        #print(myRet)
        return myRet
    raise ValueError(f"Unsupported file extension '{ext}'")
def split_docs(docs, size):
    for i in range(0, len(docs), size):
        yield docs[i:i + size]
def runembed(dbread, docs, i):
    #print(i)
    #print(docs)
    dbread.add_documents([docs])
def remove_ws(d):
    text = d.page_content.strip()
    text = text.encode("ascii", errors='ignore')
    text = text.decode('utf-8', errors='ignore')
    d.page_content = text
    d.metadata = {}  # drop loader metadata; Document metadata should be a dict, not a string
    return d
def filtercollname(collname):
    collname = collname.replace('-', '')
    collname = collname.replace('(', '')
    collname = collname.replace(')', '')
    collname = collname.replace('.', '')
    collname = collname.replace('_', '')
    collname = collname.replace(' ', '')
    return collname
def processDocs(file):
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=0)
    documents = load_single_document(file)
    docs = text_splitter.split_documents(documents)
    docs = [remove_ws(d) for d in docs]
    totLen = len(docs)
    #splitsize = 1 #len(docs)
    #docs = split_docs(docs, splitsize)
    return docs, totLen
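# Ingestion strategy: the first chunk is embedded synchronously via Chroma.from_documents()
# so the collection exists, then the remaining chunks are added through runembed() in
# batches of up to numThr worker threads; each batch is joined before the next one starts
# so the progress bar stays in step with what has actually been persisted.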
def iterateDocs(file, progress, numThr):
    arrProc = []
    countthread = 0
    disLen = 0
    docs_chunked, totLen = processDocs(file)
    #print(numThr)
    # one Chroma collection per uploaded file, named after the sanitized file name
    collname = os.path.basename(file)
    collname = filtercollname(collname)
    #print(collname)
    dbread = Chroma(persist_directory=PERSIST_DIR)
    dbread._client.get_or_create_collection(name=collname)
    #dbread._client.delete_collection(name=collname)
    embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:")
    for i, doc in enumerate(docs_chunked):
        if i == 0:
            #print(doc)
            dbread = Chroma.from_documents(collection_name=collname, documents=[doc], embedding=embeddings, persist_directory=PERSIST_DIR)
            progress((i + 1) / totLen)
        else:
            proc = Thread(target=runembed, args=(dbread, doc, i,))
            arrProc.append(proc)
            proc.start()
            countthread += 1
            if countthread >= numThr:
                for z, proc in enumerate(arrProc):
                    proc.join()
                    progress((i + 1) / totLen)
                arrProc = []
                countthread = 0
                disLen = i
    for i, proc in enumerate(arrProc):
        proc.join()
        progress((disLen + 1) / totLen)
        disLen += 1
    return docs_chunked
###################End file ingestion
###############################For AI Query
class QueueCallback(BaseCallbackHandler):
    """Callback handler for streaming LLM responses to a queue."""
    def __init__(self, q):
        self.q = q
    def on_llm_new_token(self, token: str, **kwargs: Any) -> None:
        self.q.put(token)
    def on_llm_end(self, *args, **kwargs: Any) -> None:
        return self.q.empty()
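# Streaming flow: analyzeAI() below attaches QueueCallback(q) to the Ollama LLM, so every
# generated token is pushed onto the shared queue; bots() drains the queue and yields
# partial answers to the Chatbot, and answer() puts job_done on the queue once
# chain.invoke() returns to mark the end of the response.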
def save_string_to_image(text, filename, font_path=None, font_size=40):
    # Determine the size of the image
    lines = text.split('\n')
    max_width = max(len(line) for line in lines)
    width = max_width * font_size // 20
    height = len(lines) * font_size * 2
    image = Image.new('RGB', (width, height), color=(255, 255, 255))
    draw = ImageDraw.Draw(image)
    font = ImageFont.truetype(font_path, font_size) if font_path else ImageFont.load_default()
    y_text = 10
    for line in lines:
        draw.text((2, y_text), line, font=font, fill=(0, 0, 0))
        y_text += font_size * 1.5  # Move down by font size for the next line
    image.save(filename)
def answer(question, chain):
    def task():
        response = chain.invoke({"input": question})
        #response = chain.invoke({'input_documents':[doc], 'input':question})
        q.put(job_done)
    t = Thread(target=task)
    t.start()
def embedDocs(myPERSIST_DIR, collname):
    collname = filtercollname(collname)
    #print(collname)
    global dbread, retriever
    embeddings = HuggingFaceBgeEmbeddings(model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs, query_instruction="Generate a representation for this sentence that can be used to retrieve related articles:")
    dbread = Chroma(collection_name=collname, embedding_function=embeddings, persist_directory=myPERSIST_DIR)
    retriever = dbread.as_retriever(search_kwargs={"k": 4})
    return retriever, dbread
def analyzeAI(modelname):
    callbacks = [QueueCallback(q)]
    llm = Ollama(model=modelname, callbacks=callbacks)
    return llm
def prompt(llm, retriever):
    question_answer_chain = create_stuff_documents_chain(llm, user_prompt)
    chain = create_retrieval_chain(retriever, question_answer_chain)
    return chain
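# Minimal sketch of how these pieces compose outside Gradio (mirrors updateDIR() below;
# 'report.pdf' is only a placeholder for a previously ingested file name):
#   retriever, db = embedDocs(PERSIST_DIR, 'report.pdf')
#   chain = prompt(analyzeAI('llava-llama3'), retriever)
#   print(chain.invoke({"input": "Summarize the uploaded document."})["answer"])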
##################end AI query
################gradio functions
def user(user_message, history, convo):
    convo.append(['user', user_message])
    return "", history + [[user_message, None]], convo
def bots(history, convo, anser):
    # bot_message = random.choice(["How are you?", "I love you", "I'm very hungry"])
    # time.sleep(2)
    # history[-1][1] = ""
    # for character in bot_message:
    #     print(character)
    #     history[-1][1] += character
    #     time.sleep(0.05)
    #     yield character
    #print(history)
    question = history[-1][0]
    history[-1][1] = ""
    try:
        answer(question, globalChain)
    except Exception:
        yield [[question, 'AI not responding. Please reload.']], convo, "Please reload", convo
        return
    while True:
        try:
            next_token = q.get(True, timeout=1)
            if next_token is job_done:
                convo.append(['assistant', anser])
                yield history, convo, anser, convo
                break
            history[-1][1] += next_token
            anser += next_token
            yield history, convo, anser, convo
        except Empty:
            continue
def purge_chat_and_render_first(file, chatbot, numThr, myFiles, origFile, progress=gr.Progress()):
    progress(0)
    myFiles = file.name
    origFile = os.path.basename(file.name)
    #Clear persistent DB
    #system = platform.system()
    #if system == "Windows":
    #dbread.reset()
    #dbread._system.stop()
    #gc.collect()
    #try:
    #    os.remove('db2/'+os.path.basename(myFiles))
    #    print("chroma.sqlite file deleted successfully.")
    #except OSError as e:
    #    print(f"Error deleting chroma.sqlite: {e}")
    #load image
    image = None
    docs_chunked = iterateDocs(file, progress, numThr)
    ext = "." + os.path.basename(file).rsplit(".", 1)[-1]
    if ext != '.pdf':
        # Non-PDF uploads get a rendered PDF preview built from their extracted text
        myFiles = Path(os.getcwd() + '/files/' + os.path.basename(file.name) + '.pdf')
        if os.path.exists('files/' + os.path.basename(file.name) + '.txt'):
            with open('files/' + os.path.basename(file.name) + '.txt', mode='r') as fh:  # 'fh' avoids shadowing the 'file' argument
                disStr = fh.read(2000)
            save_string_to_image(disStr, myFiles, font_size=30)
        else:
            #print(myDocs)
            save_string_to_image(str(docs_chunked[0].page_content), myFiles, font_size=30)
        doc = pymupdf.open(Path(myFiles))
    else:
        doc = pymupdf.open(Path(file.name))
    #print(myFiles)
    page = doc[0]
    pix = page.get_pixmap(dpi=150)
    image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    return chatbot, numThr, myFiles, image, origFile
css = """ | |
.gradio-container {background-color:#BBD3FB} | |
#rowTitle {text-align: center;} | |
#cssTitle {text-align: center; color: #FFFFFF; background:#3B69B7; font-size: 30px; padding-top: 4px; padding-bottom: 4px;} | |
#cssBut {width:100px; height: 60px;} | |
#cssText {width:100px;} | |
""" | |
#######################end gradio functions | |
def get_config(chatbot, convo, request: gr.Request):
    config = {'filepath': 'none', 'convo': [], 'origFile': 'none'}
    for key in config:
        if key in request.cookies:
            config[key] = request.cookies[key]
    #print(config['newpath'])
    image = None
    myFile = config['filepath']
    origFile = config['origFile']
    if config['filepath'] is not None:
        if config['filepath'] != 'none' and config['filepath'] != 'undefined':
            if os.path.exists(config['filepath']):
                #print(config['filepath'])
                doc = pymupdf.open(config['filepath'])
                page = doc[0]
                pix = page.get_pixmap(dpi=150)
                image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    chatbot = []
    if (config['convo'] != []) and (config['convo'] is not None) and (config['convo'] != '') and (config['convo'] != 'undefined'):
        #listconvo = config['convo'].split(',')
        #listconvo = json.dumps(config['convo'], separators=(',', ':'))
        #listconvo = json.loads(config['convo'])
        listconvo = ast.literal_eval(config['convo'])
        convo = listconvo
        for conv in listconvo:
            #print(conv)
            if conv[0] == 'user': chatbot = chatbot + [[str(conv[1]), None]]
            elif conv[0] == 'assistant': chatbot = chatbot + [[None, str(conv[1])]]
    return image, myFile, chatbot, convo, origFile
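# get_config() runs once per page load (wired via demo.load below): it reads the
# 'filepath', 'origFile' and 'convo' cookies written by the set_cookie hooks,
# re-renders the first page of the last uploaded file, and rebuilds the chat history.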
def updateDIR(myFile):
    disFile = os.path.basename(myFile)
    #ext = "." + disFile.rsplit(".", 1)[-1]
    #if ext == '.pdf':
    #    disFile = disFile[0:len(disFile)-4:1]
    retriever, dbread = embedDocs('db2/', disFile)
    #print('here')
    llm = analyzeAI('llava-llama3')
    global globalChain
    globalChain = prompt(llm, retriever)
def updateConvo(history):
    return history
def processChain(myFiles, origFile):
    return myFiles, origFile
def clearbot():
    return [], []
with gr.Blocks(css=css, fill_width=True) as demo:
    #PERSIST_DIR = 'db2/'+''.join(random.choices(string.ascii_uppercase + string.digits, k=7))+'/'
    myFile = gr.State("none")
    origFile = gr.State("none")
    numThr = gr.State(100)
    convo = gr.State([])
    anser = gr.State('')
    myDoc = gr.State([])
    with gr.Row(elem_id="rowTitle"):
        gr.Button(value="Welcome to PN Cyber AI (codename: YVAN)", elem_id="cssTitle")
    with gr.Row():
        with gr.Column(scale=8):
            with gr.Row():
                chatbot = gr.Chatbot()
                #chatbot.change(updateConvo,[chatbot], js="(convo) => {set_cookie('convo', convo)}")
            with gr.Row():
                with gr.Column(scale=4, min_width=300):
                    msg = gr.Textbox(show_label=False, placeholder="Ask me")
                with gr.Column(scale=1, min_width=50):
                    send = gr.Button("Send", elem_id="cssBut")
                with gr.Column(scale=1, min_width=50):
                    clear = gr.Button("Clear", elem_id="cssBut")
                    clear.click(clearbot, [], [chatbot, convo], queue=False, js="(convbox) => {set_cookie('convo', '')}")
        with gr.Column(scale=4):
            with gr.Row():
                show_img = gr.Image(label="Upload Logs/pcap/Doc", sources=["upload"], type="filepath")
            with gr.Row():
                btn = gr.UploadButton("📁 Upload Logs/pcap/Doc", file_types=[".pdf", ".csv", ".pcap", ".txt", ".docx", ".pptx"])
            with gr.Row():
                slider = gr.Slider(50, 1000, value=100, interactive=True, step=50, label="Thread", info="Choose Number of Threads to multitask")
    with gr.Row(visible=False):
        convobox = gr.Textbox()
        txtFile = gr.Textbox()
        origFile = gr.Textbox()
    demo.load(fn=get_config, inputs=[chatbot, convo], outputs=[show_img, txtFile, chatbot, convo, origFile], js=js
              ).then(updateDIR, [origFile])
    msg.submit(user, [msg, chatbot, convo], [msg, chatbot, convo], queue=False
               ).then(bots, [chatbot, convo, anser], [chatbot, convo, anser, convobox]
               ).then(updateConvo, [convobox], js="(convbox) => {set_cookie('convo', convbox)}")
    send.click(user, [msg, chatbot, convo], [msg, chatbot, convo], queue=False
               ).then(bots, [chatbot, convo, anser], [chatbot, convo, anser, convobox]
               ).then(updateConvo, [convobox], js="(convbox) => {set_cookie('convo', convbox)}")
    btn.upload(
        fn=purge_chat_and_render_first,
        inputs=[btn, chatbot, numThr, myFile, origFile],
        outputs=[chatbot, numThr, txtFile, show_img, origFile],
        #js="(btn, chatbot, numThr, myFile) => {set_cookie('filepath', myFile);}",
    ).then(processChain, [txtFile, origFile], js="(mFile, origFile) => {set_cookie('filepath', mFile); set_cookie('origFile', origFile);}")
if __name__ == '__main__':
    path = 'files'
    if not os.path.exists(path):
        os.makedirs(path)
    path = 'db2'
    if not os.path.exists(path):
        os.makedirs(path)
    demo.queue()
    demo.launch()