Spaces:
Sleeping
Sleeping
Commit to change the repository to include the already-trained model and pipeline
Browse files
- .idea/inspectionProfiles/profiles_settings.xml +6 -0
- .idea/misc.xml +4 -0
- .idea/modules.xml +8 -0
- .idea/science-classification.iml +14 -0
- .idea/vcs.xml +6 -0
- .idea/workspace.xml +13 -1
- app.py +17 -48
- model_linear_svc.joblib +3 -0
- requirements.txt +1 -3
- tfidf_vectorizer.joblib +3 -0
- utils.py +22 -0
.idea/inspectionProfiles/profiles_settings.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<component name="InspectionProjectProfileManager">
|
2 |
+
<settings>
|
3 |
+
<option name="USE_PROJECT_PROFILE" value="false" />
|
4 |
+
<version value="1.0" />
|
5 |
+
</settings>
|
6 |
+
</component>
|
.idea/misc.xml
ADDED
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (science-classification)" project-jdk-type="Python SDK" />
|
4 |
+
</project>
|
.idea/modules.xml
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="ProjectModuleManager">
|
4 |
+
<modules>
|
5 |
+
<module fileurl="file://$PROJECT_DIR$/.idea/science-classification.iml" filepath="$PROJECT_DIR$/.idea/science-classification.iml" />
|
6 |
+
</modules>
|
7 |
+
</component>
|
8 |
+
</project>
|
.idea/science-classification.iml
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<module type="PYTHON_MODULE" version="4">
|
3 |
+
<component name="NewModuleRootManager">
|
4 |
+
<content url="file://$MODULE_DIR$">
|
5 |
+
<excludeFolder url="file://$MODULE_DIR$/venv" />
|
6 |
+
</content>
|
7 |
+
<orderEntry type="inheritedJdk" />
|
8 |
+
<orderEntry type="sourceFolder" forTests="false" />
|
9 |
+
</component>
|
10 |
+
<component name="PyDocumentationSettings">
|
11 |
+
<option name="format" value="PLAIN" />
|
12 |
+
<option name="myDocStringFormat" value="Plain" />
|
13 |
+
</component>
|
14 |
+
</module>
|
.idea/vcs.xml
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
+
<project version="4">
|
3 |
+
<component name="VcsDirectoryMappings">
|
4 |
+
<mapping directory="$PROJECT_DIR$" vcs="Git" />
|
5 |
+
</component>
|
6 |
+
</project>
|
.idea/workspace.xml
CHANGED
@@ -1,7 +1,14 @@
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
<component name="ChangeListManager">
|
4 |
-
<list default="true" id="c9ff8b7d-c1c3-4125-8f84-0a49b7eeea0d" name="Changes" comment=""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
5 |
<option name="SHOW_DIALOG" value="false" />
|
6 |
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
7 |
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
@@ -31,6 +38,11 @@
|
|
31 |
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
32 |
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
|
33 |
</component>
|
|
|
|
|
|
|
|
|
|
|
34 |
<component name="RunManager">
|
35 |
<configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
36 |
<option name="INTERPRETER_OPTIONS" value="" />
|
|
|
1 |
<?xml version="1.0" encoding="UTF-8"?>
|
2 |
<project version="4">
|
3 |
<component name="ChangeListManager">
|
4 |
+
<list default="true" id="c9ff8b7d-c1c3-4125-8f84-0a49b7eeea0d" name="Changes" comment="">
|
5 |
+
<change afterPath="$PROJECT_DIR$/model_linear_svc.joblib" afterDir="false" />
|
6 |
+
<change afterPath="$PROJECT_DIR$/tfidf_vectorizer.joblib" afterDir="false" />
|
7 |
+
<change afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
|
8 |
+
<change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
|
9 |
+
<change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
|
10 |
+
<change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
|
11 |
+
</list>
|
12 |
<option name="SHOW_DIALOG" value="false" />
|
13 |
<option name="HIGHLIGHT_CONFLICTS" value="true" />
|
14 |
<option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
|
|
|
38 |
<property name="last_opened_file_path" value="$PROJECT_DIR$" />
|
39 |
<property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
|
40 |
</component>
|
41 |
+
<component name="RecentsManager">
|
42 |
+
<key name="CopyFile.RECENT_KEYS">
|
43 |
+
<recent name="C:\Users\borod\OneDrive\Documents\Projects\GitHub\science-classification" />
|
44 |
+
</key>
|
45 |
+
</component>
|
46 |
<component name="RunManager">
|
47 |
<configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
|
48 |
<option name="INTERPRETER_OPTIONS" value="" />
|
app.py
CHANGED
@@ -1,59 +1,28 @@
|
|
1 |
import gradio as gr
|
2 |
-
from
|
3 |
-
|
4 |
-
import pandas as pd
|
5 |
-
from sklearn.feature_extraction.text import TfidfVectorizer
|
6 |
-
from sklearn.svm import LinearSVC
|
7 |
|
8 |
|
9 |
-
|
10 |
-
|
11 |
-
|
|
|
|
|
|
|
|
|
|
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
- removes a. out of the answers
|
17 |
-
- replaces 'b.', 'c.', and 'd.' with comma ','
|
18 |
-
"""
|
19 |
-
text = text.lower()
|
20 |
-
text = text.replace('\\', '') # generic replace was advised by Danit
|
21 |
-
text = text.replace('a.', '')
|
22 |
-
text = text.replace('b.', ',')
|
23 |
-
text = text.replace('c.', ',')
|
24 |
-
text = text.replace('d.', ',')
|
25 |
-
text = sub("\d+", "<num>", text)
|
26 |
|
27 |
-
|
|
|
28 |
|
|
|
29 |
|
30 |
-
dataset = load_dataset(
|
31 |
-
"csv",
|
32 |
-
data_files={
|
33 |
-
"train": "https://huggingface.co/borodache/science-classification-data/resolve/main/subjects-questions.csv"
|
34 |
-
},
|
35 |
-
)
|
36 |
|
37 |
-
|
38 |
-
|
39 |
-
# prepare data
|
40 |
-
df = df[df['category'].isin(['Physics', 'Chemistry', 'Maths', 'Biology'])]
|
41 |
-
questions = df['question']
|
42 |
-
new_questions = []
|
43 |
-
for question in questions:
|
44 |
-
new_questions.append(clean_text(question))
|
45 |
-
|
46 |
-
X = new_questions
|
47 |
-
y = df['category']
|
48 |
-
tfidf_vectorizer = TfidfVectorizer(stop_words="english",
|
49 |
-
preprocessor=clean_text,
|
50 |
-
ngram_range=(1, 2))
|
51 |
-
|
52 |
-
features_tfidf = tfidf_vectorizer.fit_transform(X)
|
53 |
-
|
54 |
-
# Training
|
55 |
-
model = LinearSVC()
|
56 |
-
model.fit(features_tfidf, y)
|
57 |
|
58 |
|
59 |
def classify(sentence):
|
|
|
1 |
import gradio as gr
|
2 |
+
from sklearn.pipeline import Pipeline
|
3 |
+
import joblib
|
|
|
|
|
|
|
4 |
|
5 |
|
6 |
+
class CustomTextClassificationPipeline(Pipeline):
    """Two-step pipeline assembled from estimators stored on disk with joblib.

    The steps are named ``'tfidf'`` and ``'classifier'`` and are loaded from
    ``tfidf_vectorizer.joblib`` and ``model_linear_svc.joblib``. Both paths
    are relative, so they resolve against the current working directory.

    NOTE(review): judging by the file names, the artifacts are presumably a
    fitted TF-IDF vectorizer and a fitted LinearSVC -- confirm against the
    training script that produced them.
    """

    def __init__(self):
        # Load order matches the step order: vectorizer first, then classifier.
        steps = [
            ('tfidf', joblib.load("tfidf_vectorizer.joblib")),
            ('classifier', joblib.load("model_linear_svc.joblib")),
        ]
        super().__init__(steps)

    def predict(self, text):
        """Classify a single text and return its predicted label as a string.

        Unlike ``Pipeline.predict``, this takes ONE sample rather than a
        collection: ``text`` is wrapped in a one-element list before being
        handed to the parent implementation.
        """
        # The parent returns one label per sample; only one sample was passed.
        labels = super().predict([text])
        return str(labels[0])
|
23 |
|
|
|
|
|
|
|
|
|
|
|
|
|
24 |
|
25 |
+
# Build the pipeline once at import time. The constructor loads both joblib
# artifacts via relative paths, so they must be present in the working directory.
model = CustomTextClassificationPipeline()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
26 |
|
27 |
|
28 |
def classify(sentence):
|
model_linear_svc.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d7b049ab18b3d50cc79374c0ab9b544ad18807b063c65f82c9fcd62709b0a1fc
|
3 |
+
size 17757920
|
requirements.txt
CHANGED
@@ -1,5 +1,3 @@
|
|
1 |
gradio
|
2 |
-
datasets
|
3 |
-
pandas
|
4 |
-
huggingface_hub
|
5 |
scikit-learn
|
|
|
|
1 |
gradio
|
|
|
|
|
|
|
2 |
scikit-learn
|
3 |
+
joblib
|
tfidf_vectorizer.joblib
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ceb0a87b45261e3cb093b72d2c565dea618c7bd63b9e6f6c16cab1b9a4906d53
|
3 |
+
size 24881984
|
utils.py
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from re import sub
|
2 |
+
|
3 |
+
|
4 |
+
def clean_text(text):
    """Normalise a question string before vectorisation.

    Steps, applied in this order:
      1. Lower-case the text.
      2. Remove every backslash.
      3. Remove every occurrence of the substring ``'a.'``.
      4. Replace every occurrence of ``'b.'``, ``'c.'`` and ``'d.'`` with ``','``.
      5. Replace each run of digits with the token ``'<num>'``.
      6. Collapse all whitespace runs to single spaces and strip both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string.
    """
    text = text.lower()
    text = text.replace('\\', '')  # generic replace was advised by Danit
    # These are plain substring replacements, not anchored to answer markers:
    # any word ending in "a." / "b." / "c." / "d." is affected as well
    # (e.g. "data." becomes "dat").
    # NOTE(review): presumably this must stay identical to the preprocessing
    # used when the saved vectorizer was fitted -- confirm before changing it.
    text = text.replace('a.', '')
    text = text.replace('b.', ',')
    text = text.replace('c.', ',')
    text = text.replace('d.', ',')
    # Raw string: "\d" in a normal string literal is an invalid escape
    # sequence and triggers a warning on modern Python versions.
    text = sub(r"\d+", "<num>", text)

    # split() with no arguments splits on any whitespace run and drops
    # leading/trailing whitespace, so the join yields single-space separation.
    return ' '.join(text.split())
|