import os
import subprocess
import textwrap

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline

# loaders
from langchain.document_loaders import DirectoryLoader, PyPDFLoader
# splitting
from langchain.text_splitter import RecursiveCharacterTextSplitter
# prompts
from langchain import PromptTemplate
# vector stores
from langchain_community.vectorstores import FAISS
# models
from langchain.llms import HuggingFacePipeline
from langchain.embeddings import HuggingFaceInstructEmbeddings
# retrievers
from langchain.chains import RetrievalQA

# Install flash-attn at startup (the CUDA build step is skipped).
subprocess.run(
    "pip install flash-attn --no-build-isolation",
    env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)
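
# Central configuration: generation settings, chunking, embedding model, retrieval, and paths.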
class CFG:
    DEBUG = False

    # LLM
    model_name = 'justinj92/phi3-orpo'
    temperature = 0.7
    top_p = 0.90
    repetition_penalty = 1.15
    max_len = 8192
    max_new_tokens = 512

    # splitting
    split_chunk_size = 800
    split_overlap = 400

    # embeddings
    embeddings_model_repo = 'BAAI/bge-base-en-v1.5'

    # number of similar passages to retrieve
    k = 6

    # paths
    PDFs_path = './data'
    Embeddings_path = './embeddings/input'
    Output_folder = './ml-papers-vector'
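
# Load the PDF papers, split them into overlapping chunks, and build the FAISS
# index if it does not already exist; otherwise reuse the saved index.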
loader = DirectoryLoader(CFG.PDFs_path, glob="*.pdf", loader_cls=PyPDFLoader)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=CFG.split_chunk_size, chunk_overlap=CFG.split_overlap)
texts = text_splitter.split_documents(documents)

# A single embeddings instance is reused for both building and loading the index.
embeddings = HuggingFaceInstructEmbeddings(model_name=CFG.embeddings_model_repo, model_kwargs={"device": "cuda"})

if not os.path.exists(f"{CFG.Embeddings_path}/index.faiss"):
    vectordb = FAISS.from_documents(documents=texts, embedding=embeddings)
    vectordb.save_local(f"{CFG.Output_folder}/faiss_index_ml_papers")

vectordb = FAISS.load_local(f"{CFG.Output_folder}/faiss_index_ml_papers", embeddings, allow_dangerous_deserialization=True)
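
# Load the Phi3-ORPO model and wrap it in a transformers pipeline for use with LangChain.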
def build_model(model_repo=CFG.model_name):
    tokenizer = AutoTokenizer.from_pretrained(model_repo)
    model = AutoModelForCausalLM.from_pretrained(model_repo, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return tokenizer, model

tok, model = build_model()

# Token ids treated as end-of-generation markers (eos plus Phi-3 special tokens).
terminators = [tok.eos_token_id, 32007, 32011, 32001, 32000]

pipe = pipeline(task="text-generation", model=model, tokenizer=tok, eos_token_id=terminators, do_sample=True, max_new_tokens=CFG.max_new_tokens, temperature=CFG.temperature, top_p=CFG.top_p, repetition_penalty=CFG.repetition_penalty)
llm = HuggingFacePipeline(pipeline=pipe)
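
# RAG prompt in Phi-3 chat format and a similarity retriever over the FAISS index.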
prompt_template = """
<|system|>
You are an expert assistant that answers questions about machine learning and Large Language Models (LLMs).
You are given some extracted parts from machine learning papers along with a question.
If you don't know the answer, just say "I don't know." Don't try to make up an answer.
It is very important that you ALWAYS answer the question in the same language the question is in. Remember to always do that.
Use only the following pieces of context to answer the question at the end.
<|end|>
<|user|>
Context: {context}
Question is below. Remember to answer in the same language:
Question: {question}
<|end|>
<|assistant|>
"""

PROMPT = PromptTemplate(template=prompt_template, input_variables=["context", "question"])

retriever = vectordb.as_retriever(search_type="similarity", search_kwargs={"k": CFG.k})
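
# Format chain output (answer plus source references) and expose it to Gradio.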
def process_llm_response(llm_response):
    # Wrap long lines while preserving the answer's own newlines.
    ans = '\n'.join(textwrap.fill(line, width=1500) for line in llm_response['result'].split('\n'))
    sources_used = ' \n'.join(f"{source.metadata['source'].split('/')[-1][:-4]} - page: {source.metadata['page']}" for source in llm_response['source_documents'])
    ans = f"{ans}\n\nSources:\n{sources_used}"
    # Keep only the text after the assistant tag in case the prompt is echoed back.
    pattern = "<|assistant|>"
    index = ans.find(pattern)
    if index != -1:
        ans = ans[index + len(pattern):]
    return ans.strip()

# Build the retrieval QA chain once at startup and reuse it for every request.
qa_chain = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=retriever, chain_type_kwargs={"prompt": PROMPT}, return_source_documents=True, verbose=False)

@spaces.GPU
def llm_ans(message, history):
    llm_response = qa_chain.invoke(message)
    return process_llm_response(llm_response)
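
# Gradio chat UI; each user message is answered by llm_ans through the QA chain.
# Quick local sanity check (hypothetical example, not run by the app):
#   print(llm_ans("What is ORPO?", []))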
demo = gr.ChatInterface(
    fn=llm_ans,
    examples=[["Write me a poem about Machine Learning."]],
    stop_btn="Stop Generation",
    title="Chat With LLMs",
    description="Now Running Phi3-ORPO",
)

demo.launch()