awacke1 committed
Commit bd477c5 · verified · 1 Parent(s): 17f4825

Update app.py

Files changed (1)
  1. app.py +259 -199
app.py CHANGED
@@ -34,7 +34,75 @@ st.set_page_config(
34
  )
35
  load_dotenv()
36
 
37
- # 🔑 2. API Setup & Clients
38
  openai_api_key = os.getenv('OPENAI_API_KEY', "")
39
  anthropic_key = os.getenv('ANTHROPIC_API_KEY_3', "")
40
  xai_key = os.getenv('xai',"")
@@ -49,13 +117,13 @@ openai_client = OpenAI(api_key=openai.api_key, organization=os.getenv('OPENAI_OR
49
  HF_KEY = os.getenv('HF_KEY')
50
  API_URL = os.getenv('API_URL')
51
 
52
- # 📝 3. Session State Management
53
  if 'transcript_history' not in st.session_state:
54
  st.session_state['transcript_history'] = []
55
  if 'chat_history' not in st.session_state:
56
  st.session_state['chat_history'] = []
57
  if 'openai_model' not in st.session_state:
58
- st.session_state['openai_model'] = "gpt-4o-2024-05-13"
59
  if 'messages' not in st.session_state:
60
  st.session_state['messages'] = []
61
  if 'last_voice_input' not in st.session_state:
@@ -66,21 +134,19 @@ if 'edit_new_name' not in st.session_state:
66
  st.session_state['edit_new_name'] = ""
67
  if 'edit_new_content' not in st.session_state:
68
  st.session_state['edit_new_content'] = ""
69
- if 'viewing_prefix' not in st.session_state:
70
  st.session_state['viewing_prefix'] = None
71
  if 'should_rerun' not in st.session_state:
72
  st.session_state['should_rerun'] = False
73
  if 'old_val' not in st.session_state:
74
  st.session_state['old_val'] = None
75
 
76
- # 🎨 4. Custom CSS
77
  st.markdown("""
78
  <style>
79
  .main { background: linear-gradient(to right, #1a1a1a, #2d2d2d); color: #fff; }
80
  .stMarkdown { font-family: 'Helvetica Neue', sans-serif; }
81
- .stButton>button {
82
- margin-right: 0.5rem;
83
- }
84
  </style>
85
  """, unsafe_allow_html=True)
86
 
@@ -89,87 +155,37 @@ FILE_EMOJIS = {
89
  "mp3": "🎵",
90
  }
91
 
92
- # 🧠 5. High-Information Content Extraction
93
  def get_high_info_terms(text: str) -> list:
94
  """Extract high-information terms from text, including key phrases."""
95
- stop_words = set([
96
- 'the', 'a', 'an', 'and', 'or', 'but', 'in', 'on', 'at', 'to', 'for', 'of', 'with',
97
- 'by', 'from', 'up', 'about', 'into', 'over', 'after', 'is', 'are', 'was', 'were',
98
- 'be', 'been', 'being', 'have', 'has', 'had', 'do', 'does', 'did', 'will', 'would',
99
- 'should', 'could', 'might', 'must', 'shall', 'can', 'may', 'this', 'that', 'these',
100
- 'those', 'i', 'you', 'he', 'she', 'it', 'we', 'they', 'what', 'which', 'who',
101
- 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
102
- 'other', 'some', 'such', 'than', 'too', 'very', 'just', 'there'
103
- ])
104
-
105
- key_phrases = [
106
- 'artificial intelligence', 'machine learning', 'deep learning', 'neural network',
107
- 'personal assistant', 'natural language', 'computer vision', 'data science',
108
- 'reinforcement learning', 'knowledge graph', 'semantic search', 'time series',
109
- 'large language model', 'transformer model', 'attention mechanism',
110
- 'autonomous system', 'edge computing', 'quantum computing', 'blockchain technology',
111
- 'cognitive science', 'human computer', 'decision making', 'arxiv search',
112
- 'research paper', 'scientific study', 'empirical analysis'
113
- ]
114
-
115
- # Identify key phrases
116
- preserved_phrases = []
117
- lower_text = text.lower()
118
- for phrase in key_phrases:
119
- if phrase in lower_text:
120
- preserved_phrases.append(phrase)
121
- text = text.replace(phrase, '')
122
-
123
- # Extract individual words
124
- words = re.findall(r'\b\w+(?:-\w+)*\b', text)
125
- high_info_words = [
126
- word.lower() for word in words
127
- if len(word) > 3
128
- and word.lower() not in stop_words
129
- and not word.isdigit()
130
- and any(c.isalpha() for c in word)
131
- ]
132
-
133
- all_terms = preserved_phrases + high_info_words
134
- seen = set()
135
- unique_terms = []
136
- for term in all_terms:
137
- if term not in seen:
138
- seen.add(term)
139
- unique_terms.append(term)
140
-
141
- max_terms = 5
142
- return unique_terms[:max_terms]
143
 
144
  def clean_text_for_filename(text: str) -> str:
145
  """Remove punctuation and short filler words, return a compact string."""
146
- text = text.lower()
147
- text = re.sub(r'[^\w\s-]', '', text)
148
- words = text.split()
149
- stop_short = set(['the','and','for','with','this','that','from','just','very','then','been','only','also','about'])
150
- filtered = [w for w in words if len(w)>3 and w not in stop_short]
151
- return '_'.join(filtered)[:200]
152
-
153
- # 📁 6. File Operations
154
  def generate_filename(prompt, response, file_type="md"):
155
- """
156
- Generate filename with meaningful terms and short dense clips from prompt & response.
157
- The filename should be about 150 chars total, include high-info terms, and a clipped snippet.
158
- """
159
  prefix = datetime.now().strftime("%y%m_%H%M") + "_"
160
- combined = (prompt + " " + response).strip()
161
  info_terms = get_high_info_terms(combined)
162
 
163
- # Include a short snippet from prompt and response
164
- snippet = (prompt[:100] + " " + response[:100]).strip()
165
  snippet_cleaned = clean_text_for_filename(snippet)
166
 
167
- # Combine info terms and snippet
168
- # Prioritize info terms in front
169
  name_parts = info_terms + [snippet_cleaned]
170
  full_name = '_'.join(name_parts)
171
 
172
- # Trim to ~150 chars
173
  if len(full_name) > 150:
174
  full_name = full_name[:150]
175
 
@@ -179,8 +195,12 @@ def generate_filename(prompt, response, file_type="md"):
179
  def create_file(prompt, response, file_type="md"):
180
  """Create file with intelligent naming"""
181
  filename = generate_filename(prompt.strip(), response.strip(), file_type)
182
  with open(filename, 'w', encoding='utf-8') as f:
183
- f.write(prompt + "\n\n" + response)
184
  return filename
185
 
186
  def get_download_link(file):
@@ -189,23 +209,21 @@ def get_download_link(file):
189
  b64 = base64.b64encode(f.read()).decode()
190
  return f'<a href="data:file/zip;base64,{b64}" download="{os.path.basename(file)}">📂 Download {os.path.basename(file)}</a>'
191
 
192
- # 🔊 7. Audio Processing
193
  def clean_for_speech(text: str) -> str:
194
  """Clean text for speech synthesis"""
195
- text = text.replace("\n", " ")
196
- text = text.replace("</s>", " ")
197
- text = text.replace("#", "")
198
  text = re.sub(r"\(https?:\/\/[^\)]+\)", "", text)
199
- text = re.sub(r"\s+", " ", text).strip()
200
  return text
201
 
202
  @st.cache_resource
203
  def speech_synthesis_html(result):
204
  """Create HTML for speech synthesis"""
 
205
  html_code = f"""
206
  <html><body>
207
  <script>
208
- var msg = new SpeechSynthesisUtterance("{result.replace('"', '')}");
209
  window.speechSynthesis.speak(msg);
210
  </script>
211
  </body></html>
@@ -235,95 +253,152 @@ def play_and_download_audio(file_path):
235
  dl_link = f'<a href="data:audio/mpeg;base64,{base64.b64encode(open(file_path,"rb").read()).decode()}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
236
  st.markdown(dl_link, unsafe_allow_html=True)
237
 
238
- # 🎬 8. Media Processing
239
  def process_image(image_path, user_prompt):
240
  """Process image with GPT-4V"""
241
  with open(image_path, "rb") as imgf:
242
  image_data = imgf.read()
243
  b64img = base64.b64encode(image_data).decode("utf-8")
244
  resp = openai_client.chat.completions.create(
245
  model=st.session_state["openai_model"],
246
  messages=[
247
  {"role": "system", "content": "You are a helpful assistant."},
248
  {"role": "user", "content": [
249
- {"type": "text", "text": user_prompt},
250
  {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64img}"}}
251
  ]}
252
  ],
253
  temperature=0.0,
254
  )
255
- return resp.choices[0].message.content
256
 
257
  def process_audio(audio_path):
258
  """Process audio with Whisper"""
259
  with open(audio_path, "rb") as f:
260
  transcription = openai_client.audio.transcriptions.create(model="whisper-1", file=f)
261
- st.session_state.messages.append({"role": "user", "content": transcription.text})
262
- return transcription.text
263
 
264
  def process_video(video_path, seconds_per_frame=1):
265
  """Extract frames from video"""
266
- vid = cv2.VideoCapture(video_path)
267
- total = int(vid.get(cv2.CAP_PROP_FRAME_COUNT))
268
- fps = vid.get(cv2.CAP_PROP_FPS)
269
- skip = int(fps*seconds_per_frame)
270
- frames_b64 = []
271
- for i in range(0, total, skip):
272
- vid.set(cv2.CAP_PROP_POS_FRAMES, i)
273
- ret, frame = vid.read()
274
- if not ret: break
275
- _, buf = cv2.imencode(".jpg", frame)
276
- frames_b64.append(base64.b64encode(buf).decode("utf-8"))
277
- vid.release()
278
- return frames_b64
279
 
280
  def process_video_with_gpt(video_path, prompt):
281
  """Analyze video frames with GPT-4V"""
282
  frames = process_video(video_path)
283
  resp = openai_client.chat.completions.create(
284
  model=st.session_state["openai_model"],
285
  messages=[
286
  {"role":"system","content":"Analyze video frames."},
287
  {"role":"user","content":[
288
- {"type":"text","text":prompt},
289
- *[{"type":"image_url","image_url":{"url":f"data:image/jpeg;base64,{fr}"}} for fr in frames]
 
290
  ]}
291
  ]
292
  )
293
- return resp.choices[0].message.content
294
 
295
- # 🤖 9. AI Model Integration
296
 
297
- def save_full_transcript(query, text):
298
- """Save full transcript of Arxiv results as a file."""
299
- create_file(query, text, "md")
300
 
301
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary=True, full_audio=False):
302
  """Perform Arxiv search and generate audio summaries"""
 
303
  start = time.time()
 
304
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
305
- refs = client.predict(q,20,"Semantic Search","mistralai/Mixtral-8x7B-Instruct-v0.1",api_name="/update_with_rag_md")[0]
306
- r2 = client.predict(q,"mistralai/Mixtral-8x7B-Instruct-v0.1",True,api_name="/ask_llm")
307
-
308
- result = f"### 🔎 {q}\n\n{r2}\n\n{refs}"
309
-
310
  st.markdown(result)
311
-
312
- # Generate full audio version if requested
313
  if full_audio:
314
- complete_text = f"Complete response for query: {q}. {clean_for_speech(r2)} {clean_for_speech(refs)}"
315
  audio_file_full = speak_with_edge_tts(complete_text)
316
  st.write("### 📚 Full Audio")
317
  play_and_download_audio(audio_file_full)
318
 
319
  if vocal_summary:
320
- main_text = clean_for_speech(r2)
321
  audio_file_main = speak_with_edge_tts(main_text)
322
  st.write("### 🎙 Short Audio")
323
  play_and_download_audio(audio_file_main)
324
 
325
  if extended_refs:
326
- summaries_text = "Extended references: " + refs.replace('"','')
327
  summaries_text = clean_for_speech(summaries_text)
328
  audio_file_refs = speak_with_edge_tts(summaries_text)
329
  st.write("### 📜 Long Refs")
@@ -331,7 +406,7 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary
331
 
332
  if titles_summary:
333
  titles = []
334
- for line in refs.split('\n'):
335
  m = re.search(r"\[([^\]]+)\]", line)
336
  if m:
337
  titles.append(m.group(1))
@@ -342,50 +417,19 @@ def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary
342
  st.write("### 🔖 Titles")
343
  play_and_download_audio(audio_file_titles)
344
 
345
- elapsed = time.time()-start
346
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
347
 
348
- # Always create a file with the result
349
- create_file(q, result, "md")
350
-
351
  return result
352
 
353
- def process_with_gpt(text):
354
- """Process text with GPT-4"""
355
- if not text: return
356
- st.session_state.messages.append({"role":"user","content":text})
357
- with st.chat_message("user"):
358
- st.markdown(text)
359
- with st.chat_message("assistant"):
360
- c = openai_client.chat.completions.create(
361
- model=st.session_state["openai_model"],
362
- messages=st.session_state.messages,
363
- stream=False
364
- )
365
- ans = c.choices[0].message.content
366
- st.write("GPT-4o: " + ans)
367
- create_file(text, ans, "md")
368
- st.session_state.messages.append({"role":"assistant","content":ans})
369
- return ans
370
-
371
- def process_with_claude(text):
372
- """Process text with Claude"""
373
- if not text: return
374
- with st.chat_message("user"):
375
- st.markdown(text)
376
- with st.chat_message("assistant"):
377
- r = claude_client.messages.create(
378
- model="claude-3-sonnet-20240229",
379
- max_tokens=1000,
380
- messages=[{"role":"user","content":text}]
381
- )
382
- ans = r.content[0].text
383
- st.write("Claude-3.5: " + ans)
384
- create_file(text, ans, "md")
385
- st.session_state.chat_history.append({"user":text,"claude":ans})
386
- return ans
387
 
388
- # 📂 10. File Management
389
  def create_zip_of_files(md_files, mp3_files):
390
  """Create zip with intelligent naming"""
391
  md_files = [f for f in md_files if os.path.basename(f).lower() != 'readme.md']
@@ -393,12 +437,13 @@ def create_zip_of_files(md_files, mp3_files):
393
  if not all_files:
394
  return None
395
 
396
- # Collect content for high-info term extraction
397
  all_content = []
398
  for f in all_files:
399
  if f.endswith('.md'):
400
  with open(f, 'r', encoding='utf-8') as file:
401
- all_content.append(file.read())
 
 
402
  elif f.endswith('.mp3'):
403
  all_content.append(os.path.basename(f))
404
 
@@ -409,7 +454,7 @@ def create_zip_of_files(md_files, mp3_files):
409
  name_text = '_'.join(term.replace(' ', '-') for term in info_terms[:3])
410
  zip_name = f"{timestamp}_{name_text}.zip"
411
 
412
- with zipfile.ZipFile(zip_name,'w') as z:
413
  for f in all_files:
414
  z.write(f)
415
 
@@ -442,8 +487,10 @@ def extract_keywords_from_md(files):
442
  text = ""
443
  for f in files:
444
  if f.endswith(".md"):
445
- c = open(f,'r',encoding='utf-8').read()
446
- text += " " + c
 
 
447
  return get_high_info_terms(text)
448
 
449
  def display_file_manager_sidebar(groups, sorted_prefixes):
@@ -474,14 +521,14 @@ def display_file_manager_sidebar(groups, sorted_prefixes):
474
  if st.button("⬇️ ZipAll"):
475
  z = create_zip_of_files(all_md, all_mp3)
476
  if z:
477
- st.sidebar.markdown(get_download_link(z),unsafe_allow_html=True)
478
 
479
  for prefix in sorted_prefixes:
480
  files = groups[prefix]
481
  kw = extract_keywords_from_md(files)
482
  keywords_str = " ".join(kw) if kw else "No Keywords"
483
  with st.sidebar.expander(f"{prefix} Files ({len(files)}) - KW: {keywords_str}", expanded=True):
484
- c1,c2 = st.columns(2)
485
  with c1:
486
  if st.button("👀ViewGrp", key="view_group_"+prefix):
487
  st.session_state.viewing_prefix = prefix
@@ -497,25 +544,25 @@ def display_file_manager_sidebar(groups, sorted_prefixes):
497
  ctime = datetime.fromtimestamp(os.path.getmtime(f)).strftime("%Y-%m-%d %H:%M:%S")
498
  st.write(f"**{fname}** - {ctime}")
499
 
500
- # 🎯 11. Main Application
501
  def main():
502
  st.sidebar.markdown("### 🚲BikeAI🏆 Multi-Agent Research")
503
- tab_main = st.radio("Action:",["🎤 Voice","📸 Media","🔍 ArXiv","📝 Editor"],horizontal=True)
504
 
505
  mycomponent = components.declare_component("mycomponent", path="mycomponent")
506
  val = mycomponent(my_input_value="Hello")
507
 
508
  # Show input in a text box for editing if detected
509
  if val:
510
- val_stripped = val.replace('\n', ' ')
511
- edited_input = st.text_area("✏️ Edit Input:", value=val_stripped, height=100)
512
  run_option = st.selectbox("Model:", ["Arxiv", "GPT-4o", "Claude-3.5"])
513
  col1, col2 = st.columns(2)
514
  with col1:
515
  autorun = st.checkbox("⚙ AutoRun", value=True)
516
  with col2:
517
  full_audio = st.checkbox("📚FullAudio", value=False,
518
- help="Generate full audio response")
519
 
520
  input_changed = (val != st.session_state.old_val)
521
 
@@ -523,7 +570,7 @@ def main():
523
  st.session_state.old_val = val
524
  if run_option == "Arxiv":
525
  perform_ai_lookup(edited_input, vocal_summary=True, extended_refs=False,
526
- titles_summary=True, full_audio=full_audio)
527
  else:
528
  if run_option == "GPT-4o":
529
  process_with_gpt(edited_input)
@@ -534,7 +581,7 @@ def main():
534
  st.session_state.old_val = val
535
  if run_option == "Arxiv":
536
  perform_ai_lookup(edited_input, vocal_summary=True, extended_refs=False,
537
- titles_summary=True, full_audio=full_audio)
538
  else:
539
  if run_option == "GPT-4o":
540
  process_with_gpt(edited_input)
@@ -544,62 +591,70 @@ def main():
544
  if tab_main == "🔍 ArXiv":
545
  st.subheader("🔍 Query ArXiv")
546
  q = st.text_input("🔍 Query:")
 
547
 
548
  st.markdown("### 🎛 Options")
549
  vocal_summary = st.checkbox("🎙ShortAudio", value=True)
550
  extended_refs = st.checkbox("📜LongRefs", value=False)
551
  titles_summary = st.checkbox("🔖TitlesOnly", value=True)
552
  full_audio = st.checkbox("📚FullAudio", value=False,
553
- help="Full audio of results")
554
  full_transcript = st.checkbox("🧾FullTranscript", value=False,
555
- help="Generate a full transcript file")
556
 
557
  if q and st.button("🔍Run"):
558
- result = perform_ai_lookup(q, vocal_summary=vocal_summary, extended_refs=extended_refs,
559
- titles_summary=titles_summary, full_audio=full_audio)
 
  if full_transcript:
561
  save_full_transcript(q, result)
562
 
563
  st.markdown("### Change Prompt & Re-Run")
564
  q_new = st.text_input("🔄 Modify Query:")
 
565
  if q_new and st.button("🔄 Re-Run with Modified Query"):
566
- result = perform_ai_lookup(q_new, vocal_summary=vocal_summary, extended_refs=extended_refs,
567
- titles_summary=titles_summary, full_audio=full_audio)
568
  if full_transcript:
569
  save_full_transcript(q_new, result)
570
 
571
-
572
  elif tab_main == "🎤 Voice":
573
  st.subheader("🎤 Voice Input")
574
  user_text = st.text_area("💬 Message:", height=100)
575
- user_text = user_text.strip().replace('\n', ' ')
576
  if st.button("📨 Send"):
577
  process_with_gpt(user_text)
578
  st.subheader("📜 Chat History")
579
- t1,t2=st.tabs(["Claude History","GPT-4o History"])
580
  with t1:
581
  for c in st.session_state.chat_history:
582
- st.write("**You:**", c["user"])
583
- st.write("**Claude:**", c["claude"])
584
  with t2:
585
  for m in st.session_state.messages:
586
  with st.chat_message(m["role"]):
587
- st.markdown(m["content"])
 
 
589
  elif tab_main == "📸 Media":
590
  st.header("📸 Images & 🎥 Videos")
591
  tabs = st.tabs(["🖼 Images", "🎥 Video"])
592
  with tabs[0]:
593
- imgs = glob.glob("*.png")+glob.glob("*.jpg")
594
  if imgs:
595
- c = st.slider("Cols",1,5,3)
596
  cols = st.columns(c)
597
- for i,f in enumerate(imgs):
598
  with cols[i%c]:
599
- st.image(Image.open(f),use_container_width=True)
600
  if st.button(f"👀 Analyze {os.path.basename(f)}", key=f"analyze_{f}"):
601
- a = process_image(f,"Describe this image.")
602
- st.markdown(a)
603
  else:
604
  st.write("No images found.")
605
  with tabs[1]:
@@ -609,18 +664,22 @@ def main():
609
  with st.expander(f"🎥 {os.path.basename(v)}"):
610
  st.video(v)
611
  if st.button(f"Analyze {os.path.basename(v)}", key=f"analyze_{v}"):
612
- a = process_video_with_gpt(v,"Describe video.")
613
- st.markdown(a)
614
  else:
615
  st.write("No videos found.")
616
 
617
  elif tab_main == "📝 Editor":
618
- if getattr(st.session_state,'current_file',None):
619
  st.subheader(f"Editing: {st.session_state.current_file}")
620
- new_text = st.text_area("✏️ Content:", st.session_state.file_content, height=300)
621
  if st.button("💾 Save"):
622
- with open(st.session_state.current_file,'w',encoding='utf-8') as f:
623
- f.write(new_text)
 
624
  st.success("Updated!")
625
  st.session_state.should_rerun = True
626
  else:
@@ -637,8 +696,9 @@ def main():
637
  ext = os.path.splitext(fname)[1].lower().strip('.')
638
  st.write(f"### {fname}")
639
  if ext == "md":
640
- content = open(f,'r',encoding='utf-8').read()
641
- st.markdown(content)
  elif ext == "mp3":
643
  st.audio(f)
644
  else:
@@ -650,5 +710,5 @@ def main():
650
  st.session_state.should_rerun = False
651
  st.rerun()
652
 
653
- if __name__=="__main__":
654
- main()
 
34
  )
35
  load_dotenv()
36
 
37
+ # 🧠 2. Text Cleaning Functionality
38
+ class TextCleaner:
39
+ """Helper class for text cleaning operations"""
40
+ def __init__(self):
41
+ self.replacements = {
42
+ "\\n": " ", # Replace escaped newlines
43
+ "</s>": "", # Remove end tags
44
+ "<s>": "", # Remove start tags
45
+ "\n": " ", # Replace actual newlines
46
+ "\r": " ", # Replace carriage returns
47
+ "\t": " ", # Replace tabs
48
+ }
49
+
50
+ self.preserve_replacements = {
51
+ "\\n": "\n", # Convert escaped to actual newlines
52
+ "</s>": "", # Remove end tags
53
+ "<s>": "", # Remove start tags
54
+ "\r": "\n", # Convert returns to newlines
55
+ "\t": " " # Convert tabs to spaces
56
+ }
57
+
58
+ def clean_text(self, text: str, preserve_format: bool = False) -> str:
59
+ """
60
+ Clean text removing problematic characters and normalizing whitespace.
61
+ Args:
62
+ text: Text to clean
63
+ preserve_format: Whether to preserve some formatting (newlines etc)
64
+ Returns:
65
+ Cleaned text string
66
+ """
67
+ if not text or not isinstance(text, str):
68
+ return ""
69
+
70
+ replacements = (self.preserve_replacements if preserve_format
71
+ else self.replacements)
72
+
73
+ cleaned = text
74
+ for old, new in replacements.items():
75
+ cleaned = cleaned.replace(old, new)
76
+
77
+ # Normalize whitespace while preserving paragraphs if needed
78
+ if preserve_format:
79
+ cleaned = re.sub(r'\n{3,}', '\n\n', cleaned)
80
+ else:
81
+ cleaned = re.sub(r'\s+', ' ', cleaned)
82
+
83
+ return cleaned.strip()
84
+
85
+ def clean_dict(self, data: dict, fields: list) -> dict:
86
+ """Clean specified fields in a dictionary"""
87
+ if not data or not isinstance(data, dict):
88
+ return {}
89
+
90
+ cleaned = data.copy()
91
+ for field in fields:
92
+ if field in cleaned:
93
+ cleaned[field] = self.clean_text(cleaned[field])
94
+ return cleaned
95
+
96
+ def clean_list(self, items: list, fields: list) -> list:
97
+ """Clean specified fields in a list of dictionaries"""
98
+ if not isinstance(items, list):
99
+ return []
100
+ return [self.clean_dict(item, fields) for item in items]
101
+
102
+ # Initialize cleaner
103
+ cleaner = TextCleaner()
104
+
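A quick illustration of how the new cleaner behaves (not part of the commit itself; it assumes the TextCleaner class and the module-level cleaner instance defined just above, and the sample string is made up):

    raw = "Line one\\nLine two</s>\n\n\n<s>Line three\tend"
    # Default mode: strips <s>/</s> tags and collapses all whitespace to single spaces
    cleaner.clean_text(raw)                        # -> "Line one Line two Line three end"
    # preserve_format=True keeps real newlines and caps blank runs at one empty line
    cleaner.clean_text(raw, preserve_format=True)  # -> "Line one\nLine two\n\nLine three end"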
105
+ # 🔑 3. API Setup & Clients
106
  openai_api_key = os.getenv('OPENAI_API_KEY', "")
107
  anthropic_key = os.getenv('ANTHROPIC_API_KEY_3', "")
108
  xai_key = os.getenv('xai',"")
 
117
  HF_KEY = os.getenv('HF_KEY')
118
  API_URL = os.getenv('API_URL')
119
 
120
+ # 📝 4. Session State Management
121
  if 'transcript_history' not in st.session_state:
122
  st.session_state['transcript_history'] = []
123
  if 'chat_history' not in st.session_state:
124
  st.session_state['chat_history'] = []
125
  if 'openai_model' not in st.session_state:
126
+ st.session_state['openai_model'] = "gpt-4-1106-preview"
127
  if 'messages' not in st.session_state:
128
  st.session_state['messages'] = []
129
  if 'last_voice_input' not in st.session_state:
 
134
  st.session_state['edit_new_name'] = ""
135
  if 'edit_new_content' not in st.session_state:
136
  st.session_state['edit_new_content'] = ""
137
+ if 'viewing_prefix' not in st.session_state:
138
  st.session_state['viewing_prefix'] = None
139
  if 'should_rerun' not in st.session_state:
140
  st.session_state['should_rerun'] = False
141
  if 'old_val' not in st.session_state:
142
  st.session_state['old_val'] = None
143
 
144
+ # 🎨 5. Custom CSS
145
  st.markdown("""
146
  <style>
147
  .main { background: linear-gradient(to right, #1a1a1a, #2d2d2d); color: #fff; }
148
  .stMarkdown { font-family: 'Helvetica Neue', sans-serif; }
149
+ .stButton>button { margin-right: 0.5rem; }
150
  </style>
151
  """, unsafe_allow_html=True)
152
 
 
155
  "mp3": "🎵",
156
  }
157
 
158
+ # 🧠 6. High-Information Content Extraction
159
  def get_high_info_terms(text: str) -> list:
160
  """Extract high-information terms from text, including key phrases."""
161
+ text = cleaner.clean_text(text)
162
+
163
+ # ... rest of function remains the same ...
164
+ [Your existing get_high_info_terms implementation]
165
 
166
  def clean_text_for_filename(text: str) -> str:
167
  """Remove punctuation and short filler words, return a compact string."""
168
+ text = cleaner.clean_text(text)
169
+
170
+ # ... rest of function remains the same ...
171
+ [Your existing clean_text_for_filename implementation]
172
+
173
+ # 📁 7. File Operations
 
  def generate_filename(prompt, response, file_type="md"):
175
+ """Generate filename with meaningful terms."""
176
+ cleaned_prompt = cleaner.clean_text(prompt)
177
+ cleaned_response = cleaner.clean_text(response)
178
+
179
  prefix = datetime.now().strftime("%y%m_%H%M") + "_"
180
+ combined = (cleaned_prompt + " " + cleaned_response).strip()
181
  info_terms = get_high_info_terms(combined)
182
 
183
+ snippet = (cleaned_prompt[:100] + " " + cleaned_response[:100]).strip()
 
184
  snippet_cleaned = clean_text_for_filename(snippet)
185
 
186
  name_parts = info_terms + [snippet_cleaned]
187
  full_name = '_'.join(name_parts)
188
 
 
189
  if len(full_name) > 150:
190
  full_name = full_name[:150]
191
 
 
195
  def create_file(prompt, response, file_type="md"):
196
  """Create file with intelligent naming"""
197
  filename = generate_filename(prompt.strip(), response.strip(), file_type)
198
+
199
+ cleaned_prompt = cleaner.clean_text(prompt)
200
+ cleaned_response = cleaner.clean_text(response, preserve_format=True)
201
+
202
  with open(filename, 'w', encoding='utf-8') as f:
203
+ f.write(cleaned_prompt + "\n\n" + cleaned_response)
204
  return filename
205
 
206
  def get_download_link(file):
 
209
  b64 = base64.b64encode(f.read()).decode()
210
  return f'<a href="data:file/zip;base64,{b64}" download="{os.path.basename(file)}">📂 Download {os.path.basename(file)}</a>'
211
 
212
+ # 🔊 8. Audio Processing
213
  def clean_for_speech(text: str) -> str:
214
  """Clean text for speech synthesis"""
215
+ text = cleaner.clean_text(text)
 
  text = re.sub(r"\(https?:\/\/[^\)]+\)", "", text)
 
217
  return text
218
 
219
  @st.cache_resource
220
  def speech_synthesis_html(result):
221
  """Create HTML for speech synthesis"""
222
+ cleaned_result = clean_for_speech(result)
223
  html_code = f"""
224
  <html><body>
225
  <script>
226
+ var msg = new SpeechSynthesisUtterance("{cleaned_result.replace('"', '')}");
227
  window.speechSynthesis.speak(msg);
228
  </script>
229
  </body></html>
 
253
  dl_link = f'<a href="data:audio/mpeg;base64,{base64.b64encode(open(file_path,"rb").read()).decode()}" download="{os.path.basename(file_path)}">Download {os.path.basename(file_path)}</a>'
254
  st.markdown(dl_link, unsafe_allow_html=True)
255
 
256
+ # 🎬 9. Media Processing
257
  def process_image(image_path, user_prompt):
258
  """Process image with GPT-4V"""
259
  with open(image_path, "rb") as imgf:
260
  image_data = imgf.read()
261
  b64img = base64.b64encode(image_data).decode("utf-8")
262
+
263
+ cleaned_prompt = cleaner.clean_text(user_prompt)
264
+
265
  resp = openai_client.chat.completions.create(
266
  model=st.session_state["openai_model"],
267
  messages=[
268
  {"role": "system", "content": "You are a helpful assistant."},
269
  {"role": "user", "content": [
270
+ {"type": "text", "text": cleaned_prompt},
271
  {"type": "image_url", "image_url": {"url": f"data:image/png;base64,{b64img}"}}
272
  ]}
273
  ],
274
  temperature=0.0,
275
  )
276
+ return cleaner.clean_text(resp.choices[0].message.content, preserve_format=True)
277
 
278
  def process_audio(audio_path):
279
  """Process audio with Whisper"""
280
  with open(audio_path, "rb") as f:
281
  transcription = openai_client.audio.transcriptions.create(model="whisper-1", file=f)
282
+
283
+ cleaned_text = cleaner.clean_text(transcription.text)
284
+ st.session_state.messages.append({
285
+ "role": "user",
286
+ "content": cleaned_text
287
+ })
288
+ return cleaned_text
289
 
290
  def process_video(video_path, seconds_per_frame=1):
291
  """Extract frames from video"""
292
+ # ... function remains the same as it handles binary data ...
293
+ [Your existing process_video implementation]
294
 
295
  def process_video_with_gpt(video_path, prompt):
296
  """Analyze video frames with GPT-4V"""
297
  frames = process_video(video_path)
298
+ cleaned_prompt = cleaner.clean_text(prompt)
299
+
300
  resp = openai_client.chat.completions.create(
301
  model=st.session_state["openai_model"],
302
  messages=[
303
  {"role":"system","content":"Analyze video frames."},
304
  {"role":"user","content":[
305
+ {"type":"text","text":cleaned_prompt},
306
+ *[{"type":"image_url","image_url":{"url":f"data:image/jpeg;base64,{fr}"}}
307
+ for fr in frames]
308
  ]}
309
  ]
310
  )
311
+ return cleaner.clean_text(resp.choices[0].message.content, preserve_format=True)
312
 
313
+ # 🤖 10. AI Model Integration
314
+ def process_with_claude(text):
315
+ """Process text with Claude"""
316
+ if not text: return
317
+
318
+ cleaned_input = cleaner.clean_text(text)
319
+ with st.chat_message("user"):
320
+ st.markdown(cleaned_input)
321
+
322
+ with st.chat_message("assistant"):
323
+ r = claude_client.messages.create(
324
+ model="claude-3-sonnet-20240229",
325
+ max_tokens=1000,
326
+ messages=[{"role":"user","content":cleaned_input}]
327
+ )
328
+ raw_response = r.content[0].text
329
+ cleaned_response = cleaner.clean_text(raw_response, preserve_format=True)
330
+
331
+ st.write("Claude-3.5: " + cleaned_response)
332
+ create_file(cleaned_input, cleaned_response, "md")
333
+ st.session_state.chat_history.append({
334
+ "user": cleaned_input,
335
+ "claude": cleaned_response
336
+ })
337
+ return cleaned_response
338
 
339
+ def process_with_gpt(text):
340
+ """Process text with GPT-4"""
341
+ if not text: return
342
+
343
+ cleaned_input = cleaner.clean_text(text)
344
+ st.session_state.messages.append({
345
+ "role": "user",
346
+ "content": cleaned_input
347
+ })
348
+
349
+ with st.chat_message("user"):
350
+ st.markdown(cleaned_input)
351
+
352
+ with st.chat_message("assistant"):
353
+ c = openai_client.chat.completions.create(
354
+ model=st.session_state["openai_model"],
355
+ messages=st.session_state.messages,
356
+ stream=False
357
+ )
358
+ raw_response = c.choices[0].message.content
359
+ cleaned_response = cleaner.clean_text(raw_response, preserve_format=True)
360
+
361
+ st.write("GPT-4o: " + cleaned_response)
362
+ create_file(cleaned_input, cleaned_response, "md")
363
+ st.session_state.messages.append({
364
+ "role": "assistant",
365
+ "content": cleaned_response
366
+ })
367
+ return cleaned_response
368
 
369
  def perform_ai_lookup(q, vocal_summary=True, extended_refs=False, titles_summary=True, full_audio=False):
370
  """Perform Arxiv search and generate audio summaries"""
371
+ cleaned_query = cleaner.clean_text(q)
372
  start = time.time()
373
+
374
  client = Client("awacke1/Arxiv-Paper-Search-And-QA-RAG-Pattern")
375
+ refs = client.predict(cleaned_query, 20, "Semantic Search",
376
+ "mistralai/Mixtral-8x7B-Instruct-v0.1",
377
+ api_name="/update_with_rag_md")[0]
378
+ r2 = client.predict(cleaned_query, "mistralai/Mixtral-8x7B-Instruct-v0.1",
379
+ True, api_name="/ask_llm")
380
+
381
+ # Clean responses
382
+ cleaned_r2 = cleaner.clean_text(r2, preserve_format=True)
383
+ cleaned_refs = cleaner.clean_text(refs, preserve_format=True)
384
+
385
+ result = f"### 🔎 {cleaned_query}\n\n{cleaned_r2}\n\n{cleaned_refs}"
386
  st.markdown(result)
387
+
 
388
  if full_audio:
389
+ complete_text = f"Complete response for query: {cleaned_query}. {clean_for_speech(cleaned_r2)} {clean_for_speech(cleaned_refs)}"
390
  audio_file_full = speak_with_edge_tts(complete_text)
391
  st.write("### 📚 Full Audio")
392
  play_and_download_audio(audio_file_full)
393
 
394
  if vocal_summary:
395
+ main_text = clean_for_speech(cleaned_r2)
396
  audio_file_main = speak_with_edge_tts(main_text)
397
  st.write("### 🎙 Short Audio")
398
  play_and_download_audio(audio_file_main)
399
 
400
  if extended_refs:
401
+ summaries_text = "Extended references: " + cleaned_refs.replace('"','')
402
  summaries_text = clean_for_speech(summaries_text)
403
  audio_file_refs = speak_with_edge_tts(summaries_text)
404
  st.write("### 📜 Long Refs")
 
406
 
407
  if titles_summary:
408
  titles = []
409
+ for line in cleaned_refs.split('\n'):
410
  m = re.search(r"\[([^\]]+)\]", line)
411
  if m:
412
  titles.append(m.group(1))
 
417
  st.write("### 🔖 Titles")
418
  play_and_download_audio(audio_file_titles)
419
 
420
+ elapsed = time.time() - start
421
  st.write(f"**Total Elapsed:** {elapsed:.2f} s")
422
 
423
+ create_file(cleaned_query, result, "md")
424
  return result
425
 
426
+ def save_full_transcript(query, text):
427
+ """Save full transcript of results as a file."""
428
+ cleaned_query = cleaner.clean_text(query)
429
+ cleaned_text = cleaner.clean_text(text, preserve_format=True)
430
+ create_file(cleaned_query, cleaned_text, "md")
431
 
432
+ # 📂 11. File Management
433
  def create_zip_of_files(md_files, mp3_files):
434
  """Create zip with intelligent naming"""
435
  md_files = [f for f in md_files if os.path.basename(f).lower() != 'readme.md']
 
437
  if not all_files:
438
  return None
439
 
 
440
  all_content = []
441
  for f in all_files:
442
  if f.endswith('.md'):
443
  with open(f, 'r', encoding='utf-8') as file:
444
+ content = file.read()
445
+ cleaned_content = cleaner.clean_text(content)
446
+ all_content.append(cleaned_content)
447
  elif f.endswith('.mp3'):
448
  all_content.append(os.path.basename(f))
449
 
 
454
  name_text = '_'.join(term.replace(' ', '-') for term in info_terms[:3])
455
  zip_name = f"{timestamp}_{name_text}.zip"
456
 
457
+ with zipfile.ZipFile(zip_name, 'w') as z:
458
  for f in all_files:
459
  z.write(f)
460
 
 
487
  text = ""
488
  for f in files:
489
  if f.endswith(".md"):
490
+ with open(f, 'r', encoding='utf-8') as file:
491
+ content = file.read()
492
+ cleaned_content = cleaner.clean_text(content)
493
+ text += " " + cleaned_content
494
  return get_high_info_terms(text)
495
 
496
  def display_file_manager_sidebar(groups, sorted_prefixes):
 
521
  if st.button("⬇️ ZipAll"):
522
  z = create_zip_of_files(all_md, all_mp3)
523
  if z:
524
+ st.sidebar.markdown(get_download_link(z), unsafe_allow_html=True)
525
 
526
  for prefix in sorted_prefixes:
527
  files = groups[prefix]
528
  kw = extract_keywords_from_md(files)
529
  keywords_str = " ".join(kw) if kw else "No Keywords"
530
  with st.sidebar.expander(f"{prefix} Files ({len(files)}) - KW: {keywords_str}", expanded=True):
531
+ c1, c2 = st.columns(2)
532
  with c1:
533
  if st.button("👀ViewGrp", key="view_group_"+prefix):
534
  st.session_state.viewing_prefix = prefix
 
544
  ctime = datetime.fromtimestamp(os.path.getmtime(f)).strftime("%Y-%m-%d %H:%M:%S")
545
  st.write(f"**{fname}** - {ctime}")
546
 
547
+ # 🎯 12. Main Application
548
  def main():
549
  st.sidebar.markdown("### 🚲BikeAI🏆 Multi-Agent Research")
550
+ tab_main = st.radio("Action:", ["🎤 Voice", "📸 Media", "🔍 ArXiv", "📝 Editor"], horizontal=True)
551
 
552
  mycomponent = components.declare_component("mycomponent", path="mycomponent")
553
  val = mycomponent(my_input_value="Hello")
554
 
555
  # Show input in a text box for editing if detected
556
  if val:
557
+ cleaned_val = cleaner.clean_text(val)
558
+ edited_input = st.text_area("✏️ Edit Input:", value=cleaned_val, height=100)
559
  run_option = st.selectbox("Model:", ["Arxiv", "GPT-4o", "Claude-3.5"])
560
  col1, col2 = st.columns(2)
561
  with col1:
562
  autorun = st.checkbox("⚙ AutoRun", value=True)
563
  with col2:
564
  full_audio = st.checkbox("📚FullAudio", value=False,
565
+ help="Generate full audio response")
566
 
567
  input_changed = (val != st.session_state.old_val)
568
 
 
570
  st.session_state.old_val = val
571
  if run_option == "Arxiv":
572
  perform_ai_lookup(edited_input, vocal_summary=True, extended_refs=False,
573
+ titles_summary=True, full_audio=full_audio)
574
  else:
575
  if run_option == "GPT-4o":
576
  process_with_gpt(edited_input)
 
581
  st.session_state.old_val = val
582
  if run_option == "Arxiv":
583
  perform_ai_lookup(edited_input, vocal_summary=True, extended_refs=False,
584
+ titles_summary=True, full_audio=full_audio)
585
  else:
586
  if run_option == "GPT-4o":
587
  process_with_gpt(edited_input)
 
591
  if tab_main == "🔍 ArXiv":
592
  st.subheader("🔍 Query ArXiv")
593
  q = st.text_input("🔍 Query:")
594
+ q = cleaner.clean_text(q)
595
 
596
  st.markdown("### 🎛 Options")
597
  vocal_summary = st.checkbox("🎙ShortAudio", value=True)
598
  extended_refs = st.checkbox("📜LongRefs", value=False)
599
  titles_summary = st.checkbox("🔖TitlesOnly", value=True)
600
  full_audio = st.checkbox("📚FullAudio", value=False,
601
+ help="Generate full audio response")
602
  full_transcript = st.checkbox("🧾FullTranscript", value=False,
603
+ help="Generate a full transcript file")
604
 
605
  if q and st.button("🔍Run"):
606
+ result = perform_ai_lookup(q, vocal_summary=vocal_summary,
607
+ extended_refs=extended_refs,
608
+ titles_summary=titles_summary,
609
+ full_audio=full_audio)
610
  if full_transcript:
611
  save_full_transcript(q, result)
612
 
613
  st.markdown("### Change Prompt & Re-Run")
614
  q_new = st.text_input("🔄 Modify Query:")
615
+ q_new = cleaner.clean_text(q_new)
616
  if q_new and st.button("🔄 Re-Run with Modified Query"):
617
+ result = perform_ai_lookup(q_new, vocal_summary=vocal_summary,
618
+ extended_refs=extended_refs,
619
+ titles_summary=titles_summary,
620
+ full_audio=full_audio)
621
  if full_transcript:
622
  save_full_transcript(q_new, result)
623
 
 
624
  elif tab_main == "🎤 Voice":
625
  st.subheader("🎤 Voice Input")
626
  user_text = st.text_area("💬 Message:", height=100)
627
+ user_text = cleaner.clean_text(user_text)
628
  if st.button("📨 Send"):
629
  process_with_gpt(user_text)
630
  st.subheader("📜 Chat History")
631
+ t1, t2 = st.tabs(["Claude History", "GPT-4o History"])
632
  with t1:
633
  for c in st.session_state.chat_history:
634
+ st.write("**You:**", cleaner.clean_text(c["user"]))
635
+ st.write("**Claude:**", cleaner.clean_text(c["claude"], preserve_format=True))
636
  with t2:
637
  for m in st.session_state.messages:
638
  with st.chat_message(m["role"]):
639
+ if m["role"] == "user":
640
+ st.markdown(cleaner.clean_text(m["content"]))
641
+ else:
642
+ st.markdown(cleaner.clean_text(m["content"], preserve_format=True))
643
 
644
  elif tab_main == "📸 Media":
645
  st.header("📸 Images & 🎥 Videos")
646
  tabs = st.tabs(["🖼 Images", "🎥 Video"])
647
  with tabs[0]:
648
+ imgs = glob.glob("*.png") + glob.glob("*.jpg")
649
  if imgs:
650
+ c = st.slider("Cols", 1, 5, 3)
651
  cols = st.columns(c)
652
+ for i, f in enumerate(imgs):
653
  with cols[i%c]:
654
+ st.image(Image.open(f), use_container_width=True)
655
  if st.button(f"👀 Analyze {os.path.basename(f)}", key=f"analyze_{f}"):
656
+ a = process_image(f, "Describe this image.")
657
+ st.markdown(cleaner.clean_text(a, preserve_format=True))
658
  else:
659
  st.write("No images found.")
660
  with tabs[1]:
 
664
  with st.expander(f"🎥 {os.path.basename(v)}"):
665
  st.video(v)
666
  if st.button(f"Analyze {os.path.basename(v)}", key=f"analyze_{v}"):
667
+ a = process_video_with_gpt(v, "Describe video.")
668
+ st.markdown(cleaner.clean_text(a, preserve_format=True))
669
  else:
670
  st.write("No videos found.")
671
 
672
  elif tab_main == "📝 Editor":
673
+ if getattr(st.session_state, 'current_file', None):
674
  st.subheader(f"Editing: {st.session_state.current_file}")
675
+ with open(st.session_state.current_file, 'r', encoding='utf-8') as f:
676
+ content = f.read()
677
+ content = cleaner.clean_text(content, preserve_format=True)
678
+ new_text = st.text_area("✏️ Content:", content, height=300)
679
  if st.button("💾 Save"):
680
+ cleaned_content = cleaner.clean_text(new_text, preserve_format=True)
681
+ with open(st.session_state.current_file, 'w', encoding='utf-8') as f:
682
+ f.write(cleaned_content)
683
  st.success("Updated!")
684
  st.session_state.should_rerun = True
685
  else:
 
696
  ext = os.path.splitext(fname)[1].lower().strip('.')
697
  st.write(f"### {fname}")
698
  if ext == "md":
699
+ with open(f, 'r', encoding='utf-8') as file:
700
+ content = file.read()
701
+ st.markdown(cleaner.clean_text(content, preserve_format=True))
702
  elif ext == "mp3":
703
  st.audio(f)
704
  else:
 
710
  st.session_state.should_rerun = False
711
  st.rerun()
712
 
713
+ if __name__ == "__main__":
714
+ main()
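For completeness, a small sketch of the clean_dict / clean_list helpers introduced in this commit, applied to made-up records shaped like the app's chat history (illustrative only; it assumes the module-level cleaner instance from the code above):

    history = [
        {"user": "what is\\nRAG?", "claude": "Retrieval augmented generation.</s>"},
        {"user": "thanks\t!", "claude": "You're welcome.<s>"},
    ]
    # clean_list copies each dict and runs clean_text (default mode) on the listed fields,
    # so the <s>/</s> tags are stripped and whitespace is flattened to single spaces.
    cleaned_history = cleaner.clean_list(history, fields=["user", "claude"])
    # -> [{"user": "what is RAG?", "claude": "Retrieval augmented generation."},
    #     {"user": "thanks !", "claude": "You're welcome."}]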