import os
import streamlit as st
import torch
import numpy as np
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import tavily

# Evaluation Metrics Libraries
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from textstat import flesch_reading_ease, flesch_kincaid_grade
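
# word_tokenize and sentence_bleu depend on NLTK's "punkt" tokenizer data.
# The downloads below are an added convenience (a no-op when the data is already
# installed); the original listing assumes the resources are present.
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)  # required by word_tokenize on newer NLTK releases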

class AdvancedRAGChatbot:
    def __init__(self,
                 tavily_api_key: str,
                 embedding_model: str = "BAAI/bge-large-en-v1.5",
                 llm_model: str = "llama-3.3-70b-versatile",
                 temperature: float = 0.7):
        """Initialize the Advanced RAG Chatbot with Enhanced Metrics"""
        os.environ["TAVILY_API_KEY"] = tavily_api_key

        # Tavily Client
        self.tavily_client = tavily.TavilyClient(tavily_api_key)

        # NLP Components
        self.embeddings = self._configure_embeddings(embedding_model)
        self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
        self.sentiment_analyzer = pipeline("sentiment-analysis")
        self.ner_pipeline = pipeline("ner", aggregation_strategy="simple")

        # Evaluation Components
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

        # Language Model Configuration
        self.llm = self._configure_llm(llm_model, temperature)

        # Conversation Memory
        self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
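
    # --- Hedged sketches of configuration helpers --------------------------------
    # __init__ calls _configure_embeddings and _configure_llm, but their bodies are
    # not part of this listing. The minimal versions below are assumptions built on
    # the imported HuggingFaceBgeEmbeddings and ChatGroq classes, not the original code.
    def _configure_embeddings(self, embedding_model: str) -> HuggingFaceBgeEmbeddings:
        """Sketch: load a BGE embedding model (normalized embeddings assumed)."""
        return HuggingFaceBgeEmbeddings(
            model_name=embedding_model,
            encode_kwargs={"normalize_embeddings": True},
        )

    def _configure_llm(self, llm_model: str, temperature: float) -> ChatGroq:
        """Sketch: configure the Groq-hosted chat model (expects GROQ_API_KEY in the environment)."""
        return ChatGroq(model_name=llm_model, temperature=temperature)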

    def _calculate_comprehensive_metrics(self, query: str, response: str, web_sources: List[Dict]) -> Dict[str, Any]:
        """Calculate comprehensive evaluation metrics"""
        metrics = {}

        # Readability Metrics
        metrics['flesch_reading_ease'] = flesch_reading_ease(response)
        metrics['flesch_kincaid_grade'] = flesch_kincaid_grade(response)

        # Length Metrics
        metrics['query_length'] = len(word_tokenize(query))
        metrics['response_length'] = len(word_tokenize(response))

        # BLEU Score (response compared against each web source individually)
        reference_texts = [word_tokenize(source.get('content', '')) for source in web_sources]
        candidate_tokens = word_tokenize(response)
        bleu_scores = []
        for ref in reference_texts:
            try:
                bleu_score = sentence_bleu([ref], candidate_tokens)
                bleu_scores.append(bleu_score)
            except Exception:
                pass
        metrics['average_bleu_score'] = np.mean(bleu_scores) if bleu_scores else 0.0

        # ROUGE Scores (all web source contents concatenated as the reference)
        reference_text = ' '.join([source.get('content', '') for source in web_sources])
        rouge_scores = self.rouge_scorer.score(reference_text, response)
        metrics['rouge_scores'] = {
            'rouge1': rouge_scores['rouge1'].fmeasure,
            'rouge2': rouge_scores['rouge2'].fmeasure,
            'rougeL': rouge_scores['rougeL'].fmeasure
        }

        # Semantic Similarity (cosine similarity between response and source embeddings)
        try:
            web_source_embeddings = self.semantic_model.encode([source.get('content', '') for source in web_sources])
            response_embedding = self.semantic_model.encode([response])[0]
            semantic_similarities = cosine_similarity([response_embedding], web_source_embeddings)[0]
            metrics['semantic_similarity'] = {
                'mean': np.mean(semantic_similarities),
                'max': np.max(semantic_similarities),
                'min': np.min(semantic_similarities)
            }
        except Exception as e:
            st.warning(f"Semantic similarity calculation error: {e}")
            metrics['semantic_similarity'] = {'mean': 0, 'max': 0, 'min': 0}

        return metrics
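
    # --- Hedged sketch: web search helper -----------------------------------------
    # process_query calls _tavily_web_search, whose body is not part of this listing.
    # The version below assumes the standard TavilyClient.search API (a "results" list
    # with "title"/"content"/"url" fields); treat it as a sketch, not the original code.
    def _tavily_web_search(self, query: str, max_results: int = 5) -> List[Dict]:
        """Sketch: run a Tavily web search and return the raw result dictionaries."""
        try:
            search_response = self.tavily_client.search(query, max_results=max_results)
            return search_response.get("results", [])
        except Exception as e:
            st.warning(f"Tavily search error: {e}")
            return []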

    def process_query(self, query: str) -> Dict[str, Any]:
        """Process the user query with comprehensive evaluation"""
        # Web Search
        web_results = self._tavily_web_search(query)

        # Prepare context from web search
        context = "\n\n".join([
            f"Title: {result.get('title', 'N/A')}\nContent: {result.get('content', '')}"
            for result in web_results
        ])

        # NLP Analysis (semantic_score is the raw query embedding, returned for downstream use)
        semantic_score = self.semantic_model.encode([query])[0]
        sentiment_result = self.sentiment_analyzer(query)[0]

        # Safe NER processing
        try:
            entities = self.ner_pipeline(query)
        except Exception as e:
            st.warning(f"NER processing error: {e}")
            entities = []

        # Prepare prompt with web search context
        full_prompt = f"""
        Use the following web search results to answer the question precisely:

        Web Search Context:
        {context}

        Question: {query}

        Provide a comprehensive answer based on the web search results.
        """

        # Generate Response
        response = self.llm.invoke(full_prompt)
        response_content = response.content

        # Calculate Comprehensive Metrics
        evaluation_metrics = self._calculate_comprehensive_metrics(
            query,
            response_content,
            web_results
        )

        return {
            "response": response_content,
            "web_sources": web_results,
            "semantic_similarity": semantic_score.tolist(),
            "sentiment": sentiment_result,
            "named_entities": entities,
            "evaluation_metrics": evaluation_metrics
        }


def main():
    # [Previous main function code remains the same]
    # A hedged sketch of the setup those earlier lines provide is included below.
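    # --- Hedged sketch of the elided setup (assumption, not the original code) ---
    # The names used further down (chatbot, col2, user_input, submit_button) come from
    # the elided portion of main(); the lines below only illustrate one plausible way
    # to define them with Streamlit, and assume GROQ_API_KEY is set in the environment.
    st.set_page_config(page_title="Advanced RAG Chatbot", layout="wide")
    tavily_api_key = st.sidebar.text_input("Tavily API Key", type="password")
    if not tavily_api_key:
        st.sidebar.warning("Enter a Tavily API key to start.")
        st.stop()
    chatbot = AdvancedRAGChatbot(tavily_api_key=tavily_api_key)

    col1, col2 = st.columns(2)
    with col1:
        st.header("Ask a Question")
        user_input = st.text_area("Your query", height=150)
        submit_button = st.button("Submit")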
    # Add a new section to display comprehensive metrics
    with col2:
        st.header("Response & Metrics")
        if submit_button and user_input:
            with st.spinner("Searching web and processing query..."):
                try:
                    response = chatbot.process_query(user_input)

                    # Existing response display code...

                    # Comprehensive Metrics Display
                    st.markdown("### Comprehensive Evaluation Metrics")

                    # Readability Metrics
                    col_read1, col_read2 = st.columns(2)
                    with col_read1:
                        st.metric(
                            "Flesch Reading Ease",
                            f"{response['evaluation_metrics']['flesch_reading_ease']:.2f}",
                            help="Higher scores indicate easier readability"
                        )
                    with col_read2:
                        st.metric(
                            "Flesch-Kincaid Grade",
                            f"{response['evaluation_metrics']['flesch_kincaid_grade']:.2f}",
                            help="US grade level required to understand the text"
                        )

                    # Length and BLEU Metrics
                    col_len1, col_len2, col_len3 = st.columns(3)
                    with col_len1:
                        st.metric("Query Length", response['evaluation_metrics']['query_length'])
                    with col_len2:
                        st.metric("Response Length", response['evaluation_metrics']['response_length'])
                    with col_len3:
                        st.metric(
                            "BLEU Score",
                            f"{response['evaluation_metrics']['average_bleu_score']:.4f}",
                            help="Measures n-gram overlap with the web source texts"
                        )

                    # ROUGE Scores
                    st.markdown("#### ROUGE Scores")
                    rouge_metrics = response['evaluation_metrics']['rouge_scores']
                    col_rouge1, col_rouge2, col_rouge3 = st.columns(3)
                    with col_rouge1:
                        st.metric("ROUGE-1", f"{rouge_metrics['rouge1']:.4f}")
                    with col_rouge2:
                        st.metric("ROUGE-2", f"{rouge_metrics['rouge2']:.4f}")
                    with col_rouge3:
                        st.metric("ROUGE-L", f"{rouge_metrics['rougeL']:.4f}")

                    # Semantic Similarity
                    st.markdown("#### Semantic Similarity")
                    sem_sim = response['evaluation_metrics']['semantic_similarity']
                    col_sem1, col_sem2, col_sem3 = st.columns(3)
                    with col_sem1:
                        st.metric("Mean Similarity", f"{sem_sim['mean']:.4f}")
                    with col_sem2:
                        st.metric("Max Similarity", f"{sem_sim['max']:.4f}")
                    with col_sem3:
                        st.metric("Min Similarity", f"{sem_sim['min']:.4f}")

                except Exception as e:
                    st.error(f"An error occurred: {e}")


if __name__ == "__main__":
    main()