# NewsInferno / app.py
import os
import streamlit as st
import torch
import numpy as np
from typing import List, Dict, Any
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain.memory import ConversationBufferMemory
from langchain.chains import ConversationalRetrievalChain
from transformers import pipeline
from sentence_transformers import SentenceTransformer
import tavily
# Evaluation Metrics Libraries
from rouge_score import rouge_scorer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from textstat import flesch_reading_ease, flesch_kincaid_grade
import nltk

# word_tokenize depends on the NLTK "punkt" tokenizer data ("punkt_tab" on newer
# NLTK releases); fetch it up front so a fresh deployment does not fail on first use.
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
class AdvancedRAGChatbot:
def __init__(self,
tavily_api_key: str,
embedding_model: str = "BAAI/bge-large-en-v1.5",
llm_model: str = "llama-3.3-70b-versatile",
temperature: float = 0.7):
"""Initialize the Advanced RAG Chatbot with Enhanced Metrics"""
os.environ["TAVILY_API_KEY"] = tavily_api_key
# Tavily Client
self.tavily_client = tavily.TavilyClient(tavily_api_key)
# NLP Components
self.embeddings = self._configure_embeddings(embedding_model)
self.semantic_model = SentenceTransformer('all-MiniLM-L6-v2')
self.sentiment_analyzer = pipeline("sentiment-analysis")
self.ner_pipeline = pipeline("ner", aggregation_strategy="simple")
# Evaluation Components
self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
# Language Model Configuration
self.llm = self._configure_llm(llm_model, temperature)
# Conversation Memory
self.memory = ConversationBufferMemory(memory_key="chat_history", return_messages=True)
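
    # The two helpers below are referenced in __init__ but their definitions are not
    # shown in this excerpt. These are minimal sketches of what they plausibly do,
    # inferred from the imports and constructor arguments above; treat them as
    # assumptions, not the original implementations.
    def _configure_embeddings(self, embedding_model: str) -> HuggingFaceBgeEmbeddings:
        """Sketch: build a BGE embedding model, using the GPU when one is available."""
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return HuggingFaceBgeEmbeddings(
            model_name=embedding_model,
            model_kwargs={"device": device},
            encode_kwargs={"normalize_embeddings": True},
        )

    def _configure_llm(self, llm_model: str, temperature: float) -> ChatGroq:
        """Sketch: configure the Groq-hosted chat model (expects GROQ_API_KEY in the environment)."""
        return ChatGroq(model=llm_model, temperature=temperature)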
def _calculate_comprehensive_metrics(self, query: str, response: str, web_sources: List[Dict]) -> Dict[str, Any]:
"""Calculate comprehensive evaluation metrics"""
metrics = {}
# Readability Metrics
metrics['flesch_reading_ease'] = flesch_reading_ease(response)
metrics['flesch_kincaid_grade'] = flesch_kincaid_grade(response)
# Length Metrics
metrics['query_length'] = len(word_tokenize(query))
metrics['response_length'] = len(word_tokenize(response))
# BLEU Score (compared against web sources)
reference_texts = [word_tokenize(source.get('content', '')) for source in web_sources]
candidate_tokens = word_tokenize(response)
        bleu_scores = []
        # Smoothing avoids hard-zero BLEU scores when a response shares few n-grams with a source
        smoothing = SmoothingFunction().method1
        for ref in reference_texts:
            try:
                bleu_scores.append(sentence_bleu([ref], candidate_tokens, smoothing_function=smoothing))
            except Exception:
                # Skip references that cannot be scored (e.g. empty source content)
                continue
        metrics['average_bleu_score'] = float(np.mean(bleu_scores)) if bleu_scores else 0.0
# ROUGE Scores
reference_text = ' '.join([source.get('content', '') for source in web_sources])
rouge_scores = self.rouge_scorer.score(reference_text, response)
metrics['rouge_scores'] = {
'rouge1': rouge_scores['rouge1'].fmeasure,
'rouge2': rouge_scores['rouge2'].fmeasure,
'rougeL': rouge_scores['rougeL'].fmeasure
}
# Semantic Similarity
try:
web_source_embeddings = self.semantic_model.encode([source.get('content', '') for source in web_sources])
response_embedding = self.semantic_model.encode([response])[0]
semantic_similarities = cosine_similarity([response_embedding], web_source_embeddings)[0]
metrics['semantic_similarity'] = {
'mean': np.mean(semantic_similarities),
'max': np.max(semantic_similarities),
'min': np.min(semantic_similarities)
}
except Exception as e:
st.warning(f"Semantic similarity calculation error: {e}")
metrics['semantic_similarity'] = {'mean': 0, 'max': 0, 'min': 0}
return metrics
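
    # _tavily_web_search is called by process_query below, but its definition is not
    # shown in this excerpt. The sketch here assumes the standard tavily-python client
    # API (TavilyClient.search returns a dict whose "results" list holds
    # title/url/content entries); it is a reconstruction, not the original code.
    def _tavily_web_search(self, query: str, max_results: int = 5) -> List[Dict[str, Any]]:
        """Sketch: run a Tavily web search and return the raw result dictionaries."""
        try:
            search_response = self.tavily_client.search(query, max_results=max_results)
            return search_response.get("results", [])
        except Exception as e:
            st.warning(f"Tavily search error: {e}")
            return []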
def process_query(self, query: str) -> Dict[str, Any]:
"""Process the user query with comprehensive evaluation"""
# Web Search
web_results = self._tavily_web_search(query)
# Prepare context from web search
context = "\n\n".join([
f"Title: {result.get('title', 'N/A')}\nContent: {result.get('content', '')}"
for result in web_results
])
        # NLP Analysis
        # Note: this is the raw query embedding vector (not a similarity score); it is
        # returned to the UI under the "semantic_similarity" key below.
        semantic_score = self.semantic_model.encode([query])[0]
sentiment_result = self.sentiment_analyzer(query)[0]
# Safe NER processing
try:
entities = self.ner_pipeline(query)
except Exception as e:
st.warning(f"NER processing error: {e}")
entities = []
# Prepare prompt with web search context
full_prompt = f"""
Use the following web search results to answer the question precisely:
Web Search Context:
{context}
Question: {query}
Provide a comprehensive answer based on the web search results.
"""
# Generate Response
response = self.llm.invoke(full_prompt)
response_content = response.content
# Calculate Comprehensive Metrics
evaluation_metrics = self._calculate_comprehensive_metrics(
query,
response_content,
web_results
)
return {
"response": response_content,
"web_sources": web_results,
"semantic_similarity": semantic_score.tolist(),
"sentiment": sentiment_result,
"named_entities": entities,
"evaluation_metrics": evaluation_metrics
}
def main():
# [Previous main function code remains the same]
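    # The earlier part of main() is elided above ("[Previous main function code
    # remains the same]"). The block below is an assumed minimal reconstruction of
    # that setup, just enough to define the names used later (chatbot, col2,
    # user_input, submit_button); the actual app may configure these differently,
    # and a real app would likely cache the chatbot (e.g. with st.cache_resource).
    st.set_page_config(page_title="NewsInferno", layout="wide")
    tavily_api_key = st.sidebar.text_input("Tavily API Key", type="password")
    if not tavily_api_key:
        st.sidebar.warning("Enter a Tavily API key to enable web search.")
        st.stop()
    chatbot = AdvancedRAGChatbot(tavily_api_key=tavily_api_key)
    col1, col2 = st.columns(2)
    with col1:
        st.header("Ask a Question")
        user_input = st.text_area("Your query")
        submit_button = st.button("Submit")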
# Add a new section to display comprehensive metrics
with col2:
st.header("Response & Metrics")
if submit_button and user_input:
with st.spinner("Searching web and processing query..."):
try:
response = chatbot.process_query(user_input)
# Existing response display code...
# Comprehensive Metrics Display
st.markdown("### πŸ“Š Comprehensive Evaluation Metrics")
# Readability Metrics
col_read1, col_read2 = st.columns(2)
with col_read1:
st.metric(
"Flesch Reading Ease",
f"{response['evaluation_metrics']['flesch_reading_ease']:.2f}",
help="Higher scores indicate easier readability"
)
with col_read2:
st.metric(
"Flesch-Kincaid Grade",
f"{response['evaluation_metrics']['flesch_kincaid_grade']:.2f}",
help="US grade level required to understand the text"
)
# Length and BLEU Metrics
col_len1, col_len2, col_len3 = st.columns(3)
with col_len1:
st.metric("Query Length", response['evaluation_metrics']['query_length'])
with col_len2:
st.metric("Response Length", response['evaluation_metrics']['response_length'])
with col_len3:
st.metric(
"BLEU Score",
f"{response['evaluation_metrics']['average_bleu_score']:.4f}",
help="Measures similarity to reference texts"
)
# ROUGE Scores
st.markdown("#### πŸ“ˆ ROUGE Scores")
rouge_metrics = response['evaluation_metrics']['rouge_scores']
col_rouge1, col_rouge2, col_rouge3 = st.columns(3)
with col_rouge1:
st.metric("ROUGE-1", f"{rouge_metrics['rouge1']:.4f}")
with col_rouge2:
st.metric("ROUGE-2", f"{rouge_metrics['rouge2']:.4f}")
with col_rouge3:
st.metric("ROUGE-L", f"{rouge_metrics['rougeL']:.4f}")
# Semantic Similarity
st.markdown("#### πŸ” Semantic Similarity")
sem_sim = response['evaluation_metrics']['semantic_similarity']
col_sem1, col_sem2, col_sem3 = st.columns(3)
with col_sem1:
st.metric("Mean Similarity", f"{sem_sim['mean']:.4f}")
with col_sem2:
st.metric("Max Similarity", f"{sem_sim['max']:.4f}")
with col_sem3:
st.metric("Min Similarity", f"{sem_sim['min']:.4f}")
except Exception as e:
st.error(f"An error occurred: {e}")
if __name__ == "__main__":
main()