Spaces:

borodache
/

science-classification

Sleeping

App Files Files Community

borodache committed on May 3, 2024

Commit

1afb34d

1 Parent(s): 58148f9

commit to change the repository to include already trained model and pipeline

Browse files

Files changed (11) hide show

.idea/inspectionProfiles/profiles_settings.xml +6 -0
.idea/misc.xml +4 -0
.idea/modules.xml +8 -0
.idea/science-classification.iml +14 -0
.idea/vcs.xml +6 -0
.idea/workspace.xml +13 -1
app.py +17 -48
model_linear_svc.joblib +3 -0
requirements.txt +1 -3
tfidf_vectorizer.joblib +3 -0
utils.py +22 -0

.idea/inspectionProfiles/profiles_settings.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<component name="InspectionProjectProfileManager">
+  <settings>
+    <option name="USE_PROJECT_PROFILE" value="false" />
+    <version value="1.0" />
+  </settings>
+</component>

.idea/misc.xml ADDED Viewed

	@@ -0,0 +1,4 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (science-classification)" project-jdk-type="Python SDK" />
+</project>

.idea/modules.xml ADDED Viewed

	@@ -0,0 +1,8 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/science-classification.iml" filepath="$PROJECT_DIR$/.idea/science-classification.iml" />
+    </modules>
+  </component>
+</project>

.idea/science-classification.iml ADDED Viewed

	@@ -0,0 +1,14 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$">
+      <excludeFolder url="file://$MODULE_DIR$/venv" />
+    </content>
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="PyDocumentationSettings">
+    <option name="format" value="PLAIN" />
+    <option name="myDocStringFormat" value="Plain" />
+  </component>
+</module>

.idea/vcs.xml ADDED Viewed

	@@ -0,0 +1,6 @@

+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="VcsDirectoryMappings">
+    <mapping directory="$PROJECT_DIR$" vcs="Git" />
+  </component>
+</project>

.idea/workspace.xml CHANGED Viewed

@@ -1,7 +1,14 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="ChangeListManager">
-    <list default="true" id="c9ff8b7d-c1c3-4125-8f84-0a49b7eeea0d" name="Changes" comment="" />
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -31,6 +38,11 @@
     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
     <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
   </component>
   <component name="RunManager">
     <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
       <option name="INTERPRETER_OPTIONS" value="" />

 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
   <component name="ChangeListManager">
+    <list default="true" id="c9ff8b7d-c1c3-4125-8f84-0a49b7eeea0d" name="Changes" comment="">
+      <change afterPath="$PROJECT_DIR$/model_linear_svc.joblib" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/tfidf_vectorizer.joblib" afterDir="false" />
+      <change afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
+      <change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
+    </list>
     <option name="SHOW_DIALOG" value="false" />
     <option name="HIGHLIGHT_CONFLICTS" value="true" />
     <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
     <property name="last_opened_file_path" value="$PROJECT_DIR$" />
     <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
   </component>
+  <component name="RecentsManager">
+    <key name="CopyFile.RECENT_KEYS">
+      <recent name="C:\Users\borod\OneDrive\Documents\Projects\GitHub\science-classification" />
+    </key>
+  </component>
   <component name="RunManager">
     <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
       <option name="INTERPRETER_OPTIONS" value="" />

app.py CHANGED Viewed

@@ -1,59 +1,28 @@
 import gradio as gr
-from re import sub
-from datasets import load_dataset
-import pandas as pd
-from sklearn.feature_extraction.text import TfidfVectorizer
-from sklearn.svm import LinearSVC
-def clean_text(text):
-    """
-    Applies some pre-processing on the given text.
-    Steps :
-    - Lowering text
-    - Removing backslashes
-    - removes a. out of the answers
-    - replaces 'b.', 'c.', and 'd.' with comma ','
-    """
-    text = text.lower()
-    text = text.replace('\\', '')  # generic replace was advised by Danit
-    text = text.replace('a.', '')
-    text = text.replace('b.', ',')
-    text = text.replace('c.', ',')
-    text = text.replace('d.', ',')
-    text = sub("\d+", "<num>", text)
-    return ' '.join(text.split())
-dataset = load_dataset(
-    "csv",
-    data_files={
-        "train": "https://huggingface.co/borodache/science-classification-data/resolve/main/subjects-questions.csv"
-    },
-)
-df = dataset["train"]
-# prepare data
-df = df[df['category'].isin(['Physics', 'Chemistry', 'Maths', 'Biology'])]
-questions = df['question']
-new_questions = []
-for question in questions:
-    new_questions.append(clean_text(question))
-X = new_questions
-y = df['category']
-tfidf_vectorizer = TfidfVectorizer(stop_words="english",
-                                   preprocessor=clean_text,
-                                   ngram_range=(1, 2))
-features_tfidf = tfidf_vectorizer.fit_transform(X)
-# Training
-model = LinearSVC()
-model.fit(features_tfidf, y)
 def classify(sentence):

 import gradio as gr
+from sklearn.pipeline import Pipeline
+import joblib
+class CustomTextClassificationPipeline(Pipeline):
+    def __init__(self):
+        tfidf_vectorizer = joblib.load("tfidf_vectorizer.joblib")
+        linear_svc = joblib.load("model_linear_svc.joblib")
+        super().__init__([
+            ('tfidf', tfidf_vectorizer),
+            ('classifier', linear_svc)
+        ])
+    def predict(self, text):
+        # Call the parent predict method to get the list of predicted labels
+        y_pred_list = super().predict([text])
+        # Convert the list to a string by taking the first element
+        y_pred_str = str(y_pred_list[0])
+        return y_pred_str
+model = CustomTextClassificationPipeline()
 def classify(sentence):

model_linear_svc.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d7b049ab18b3d50cc79374c0ab9b544ad18807b063c65f82c9fcd62709b0a1fc
+size 17757920

requirements.txt CHANGED Viewed

@@ -1,5 +1,3 @@
 gradio
-datasets
-pandas
-huggingface_hub
 scikit-learn

 gradio
 scikit-learn
+joblib

tfidf_vectorizer.joblib ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ceb0a87b45261e3cb093b72d2c565dea618c7bd63b9e6f6c16cab1b9a4906d53
+size 24881984

utils.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from re import sub
+def clean_text(text):
+    """
+    Applies some pre-processing on the given text.
+    Steps :
+    - Lowering text
+    - Removing backslashes
+    - removes a. out of the answers
+    - replaces 'b.', 'c.', and 'd.' with comma ','
+    """
+    text = text.lower()
+    text = text.replace('\\', '')  # generic replace was advised by Danit
+    text = text.replace('a.', '')
+    text = text.replace('b.', ',')
+    text = text.replace('c.', ',')
+    text = text.replace('d.', ',')
+    text = sub("\d+", "<num>", text)
+    return ' '.join(text.split())