# NewsInferno / app.py
import os
import streamlit as st
import torch
import numpy as np
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import tavily
# Evaluation Metrics Libraries
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from textstat import flesch_reading_ease, flesch_kincaid_grade
import nltk

# word_tokenize depends on the NLTK "punkt" tokenizer data ("punkt_tab" on newer
# NLTK releases); fetch it up front so a fresh deployment does not fail on first use.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
class AdvancedRAGChatbot:
def __init__(self,
tavily_api_key: str,
embedding_model: str = "BAAI/bge-large-en-v1.5",
llm_model: str = "llama-3.3-70b-versatile",
temperature: float = 0.7):
"""Initialize the Advanced RAG Chatbot with Enhanced Metrics"""
os.environ["TAVILY_API_KEY"] = tavily_api_key
# Tavily Client
self.tavily_client = tavily.TavilyClient(tavily_api_key)
# NLP Components
self.embeddings = self._configure_embeddings(embedding_model)
self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
self.sentiment_analyzer = pipeline("sentiment-analysis")
self.ner_pipeline = pipeline("ner", aggregation_strategy="simple")
# Evaluation Components
self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Language Model Configuration
self.llm = self._configure_llm(llm_model, temperature)
# Conversation Memory
self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
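
    # The two helpers below are referenced in __init__ but their definitions are not
    # shown in this excerpt. These are minimal sketches of what they plausibly do,
    # inferred from the imports and constructor arguments above; treat them as
    # assumptions, not the original implementations.
    def _configure_embeddings(self, embedding_model: str) -> HuggingFaceBgeEmbeddings:
        """Sketch: build a BGE embedding model, using the GPU when one is available."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return HuggingFaceBgeEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": True},
        )

    def _configure_llm(self, llm_model: str, temperature: float) -> ChatGroq:
        """Sketch: configure the Groq-hosted chat model (expects GROQ_API_KEY in the environment)."""
        return ChatGroq(model=llm_model, temperature=temperature)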
def _calculate_comprehensive_metrics(self, query: str, response: str, web_sources: List[Dict]) -> Dict[str, Any]:
"""Calculate comprehensive evaluation metrics"""
metrics = {}
# Readability Metrics
metrics['flesch_reading_ease'] = flesch_reading_ease(response)
metrics['flesch_kincaid_grade'] = flesch_kincaid_grade(response)
# Length Metrics
metrics['query_length'] = len(word_tokenize(query))
metrics['response_length'] = len(word_tokenize(response))
# BLEU Score (compared against web sources)
reference_texts = [word_tokenize(source.get('content', '')) for source in web_sources]
candidate_tokens = word_tokenize(response)
        bleu_scores = []
        # Smoothing avoids hard-zero BLEU scores when a response shares few n-grams with a source
        smoothing = SmoothingFunction().method1
        for ref in reference_texts:
            try:
                bleu_scores.append(sentence_bleu([ref], candidate_tokens, smoothing_function=smoothing))
            except Exception:
                # Skip references that cannot be scored (e.g. empty source content)
                continue
        metrics['average_bleu_score'] = float(np.mean(bleu_scores)) if bleu_scores else 0.0
# ROUGE Scores
reference_text = ' '.join([source.get('content', '') for source in web_sources])
rouge_scores = self.rouge_scorer.score(reference_text, response)
metrics['rouge_scores'] = {
'rouge1': rouge_scores['rouge1'].fmeasure,
'rouge2': rouge_scores['rouge2'].fmeasure,
'rougeL': rouge_scores['rougeL'].fmeasure
}
# Semantic Similarity
try:
web_source_embeddings = self.semantic_model.encode([source.get('content', '') for source in web_sources])
response_embedding = self.semantic_model.encode([response])[0]
semantic_similarities = cosine_similarity([response_embedding], web_source_embeddings)[0]
metrics['semantic_similarity'] = {
'mean': np.mean(semantic_similarities),
'max': np.max(semantic_similarities),
'min': np.min(semantic_similarities)
}
except Exception as e:
st.warning(f"Semantic similarity calculation error: {e}")
metrics['semantic_similarity'] = {'mean': 0, 'max': 0, 'min': 0}
return metrics
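
    # _tavily_web_search is called by process_query below, but its definition is not
    # shown in this excerpt. The sketch here assumes the standard tavily-python client
    # API (TavilyClient.search returns a dict whose "results" list holds
    # title/url/content entries); it is a reconstruction, not the original code.
    def _tavily_web_search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """Sketch: run a Tavily web search and return the raw result dictionaries."""
        try:
            search_response = self.tavily_client.search(query, max_results=max_results)
            return search_response.get("results", [])
        except Exception as e:
            st.warning(f"Tavily search error: {e}")
            return []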
def process_query(self, query: str) -> Dict[str, Any]:
"""Process the user query with comprehensive evaluation"""
# Web Search
web_results = self._tavily_web_search(query)
# Prepare context from web search
context = "\n\n".join([
f"Title: {result.get('title', 'N/A')}\nContent: {result.get('content', '')}"
for result in web_results
])
        # NLP Analysis
        # Note: this is the raw query embedding vector (not a similarity score); it is
        # returned to the UI under the "semantic_similarity" key below.
        semantic_score = self.semantic_model.encode([query])[0]
sentiment_result = self.sentiment_analyzer(query)[0]
# Safe NER processing
try:
entities = self.ner_pipeline(query)
except Exception as e:
st.warning(f"NER processing error: {e}")
entities = []
# Prepare prompt with web search context
full_prompt = f"""
Use the following web search results to answer the question precisely:
Web Search Context:
{context}
Question: {query}
Provide a comprehensive answer based on the web search results.
"""
# Generate Response
response = self.llm.invoke(full_prompt)
response_content = response.content
# Calculate Comprehensive Metrics
evaluation_metrics = self._calculate_comprehensive_metrics(
query,
response_content,
web_results
)
return {
"response": response_content,
"web_sources": web_results,
"semantic_similarity": semantic_score.tolist(),
"sentiment": sentiment_result,
"named_entities": entities,
"evaluation_metrics": evaluation_metrics
}
def main():
# [Previous main function code remains the same]
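    # The earlier part of main() is elided above ("[Previous main function code
    # remains the same]"). The block below is an assumed minimal reconstruction of
    # that setup, just enough to define the names used later (chatbot, col2,
    # user_input, submit_button); the actual app may configure these differently,
    # and a real app would likely cache the chatbot (e.g. with st.cache_resource).
    st.set_page_config(page_title="NewsInferno", layout="wide")
    tavily_api_key = st.sidebar.text_input("Tavily API Key", type="password")
    if not tavily_api_key:
        st.sidebar.warning("Enter a Tavily API key to enable web search.")
        st.stop()
    chatbot = AdvancedRAGChatbot(tavily_api_key=tavily_api_key)
    col1, col2 = st.columns(2)
    with col1:
        st.header("Ask a Question")
        user_input = st.text_area("Your query")
        submit_button = st.button("Submit")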
# Add a new section to display comprehensive metrics
with col2:
st.header("Response & Metrics")
if submit_button and user_input:
with st.spinner("Searching web and processing query..."):
try:
response = chatbot.process_query(user_input)
# Existing response display code...
# Comprehensive Metrics Display
st.markdown("### πŸ“Š Comprehensive Evaluation Metrics")
# Readability Metrics
col_read1, col_read2 = st.columns(2)
with col_read1:
st.metric(
"Flesch Reading Ease",
f"{response['evaluation_metrics']['flesch_reading_ease']:.2f}",
help="Higher scores indicate easier readability"
)
with col_read2:
st.metric(
"Flesch-Kincaid Grade",
f"{response['evaluation_metrics']['flesch_kincaid_grade']:.2f}",
help="US grade level required to understand the text"
)
# Length and BLEU Metrics
col_len1, col_len2, col_len3 = st.columns(3)
with col_len1:
st.metric("Query Length", response['evaluation_metrics']['query_length'])
with col_len2:
st.metric("Response Length", response['evaluation_metrics']['response_length'])
with col_len3:
st.metric(
"BLEU Score",
f"{response['evaluation_metrics']['average_bleu_score']:.4f}",
help="Measures similarity to reference texts"
)
# ROUGE Scores
st.markdown("#### πŸ“ˆ ROUGE Scores")
rouge_metrics = response['evaluation_metrics']['rouge_scores']
col_rouge1, col_rouge2, col_rouge3 = st.columns(3)
with col_rouge1:
st.metric("ROUGE-1", f"{rouge_metrics['rouge1']:.4f}")
with col_rouge2:
st.metric("ROUGE-2", f"{rouge_metrics['rouge2']:.4f}")
with col_rouge3:
st.metric("ROUGE-L", f"{rouge_metrics['rougeL']:.4f}")
# Semantic Similarity
st.markdown("#### πŸ” Semantic Similarity")
sem_sim = response['evaluation_metrics']['semantic_similarity']
col_sem1, col_sem2, col_sem3 = st.columns(3)
with col_sem1:
st.metric("Mean Similarity", f"{sem_sim['mean']:.4f}")
with col_sem2:
st.metric("Max Similarity", f"{sem_sim['max']:.4f}")
with col_sem3:
st.metric("Min Similarity", f"{sem_sim['min']:.4f}")
except Exception as e:
st.error(f"An error occurred: {e}")
if __name__ == "__main__":
main()