borodache committed on
Commit
1afb34d
·
1 Parent(s): 58148f9

Commit to change the repository to include the already-trained model and pipeline

Browse files
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <component name="InspectionProjectProfileManager">
2
+ <settings>
3
+ <option name="USE_PROJECT_PROFILE" value="false" />
4
+ <version value="1.0" />
5
+ </settings>
6
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.7 (science-classification)" project-jdk-type="Python SDK" />
4
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="ProjectModuleManager">
4
+ <modules>
5
+ <module fileurl="file://$PROJECT_DIR$/.idea/science-classification.iml" filepath="$PROJECT_DIR$/.idea/science-classification.iml" />
6
+ </modules>
7
+ </component>
8
+ </project>
.idea/science-classification.iml ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <module type="PYTHON_MODULE" version="4">
3
+ <component name="NewModuleRootManager">
4
+ <content url="file://$MODULE_DIR$">
5
+ <excludeFolder url="file://$MODULE_DIR$/venv" />
6
+ </content>
7
+ <orderEntry type="inheritedJdk" />
8
+ <orderEntry type="sourceFolder" forTests="false" />
9
+ </component>
10
+ <component name="PyDocumentationSettings">
11
+ <option name="format" value="PLAIN" />
12
+ <option name="myDocStringFormat" value="Plain" />
13
+ </component>
14
+ </module>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ <?xml version="1.0" encoding="UTF-8"?>
2
+ <project version="4">
3
+ <component name="VcsDirectoryMappings">
4
+ <mapping directory="$PROJECT_DIR$" vcs="Git" />
5
+ </component>
6
+ </project>
.idea/workspace.xml CHANGED
@@ -1,7 +1,14 @@
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
  <component name="ChangeListManager">
4
- <list default="true" id="c9ff8b7d-c1c3-4125-8f84-0a49b7eeea0d" name="Changes" comment="" />
 
 
 
 
 
 
 
5
  <option name="SHOW_DIALOG" value="false" />
6
  <option name="HIGHLIGHT_CONFLICTS" value="true" />
7
  <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
@@ -31,6 +38,11 @@
31
  <property name="last_opened_file_path" value="$PROJECT_DIR$" />
32
  <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
33
  </component>
 
 
 
 
 
34
  <component name="RunManager">
35
  <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
36
  <option name="INTERPRETER_OPTIONS" value="" />
 
1
  <?xml version="1.0" encoding="UTF-8"?>
2
  <project version="4">
3
  <component name="ChangeListManager">
4
+ <list default="true" id="c9ff8b7d-c1c3-4125-8f84-0a49b7eeea0d" name="Changes" comment="">
5
+ <change afterPath="$PROJECT_DIR$/model_linear_svc.joblib" afterDir="false" />
6
+ <change afterPath="$PROJECT_DIR$/tfidf_vectorizer.joblib" afterDir="false" />
7
+ <change afterPath="$PROJECT_DIR$/utils.py" afterDir="false" />
8
+ <change beforePath="$PROJECT_DIR$/.idea/workspace.xml" beforeDir="false" afterPath="$PROJECT_DIR$/.idea/workspace.xml" afterDir="false" />
9
+ <change beforePath="$PROJECT_DIR$/app.py" beforeDir="false" afterPath="$PROJECT_DIR$/app.py" afterDir="false" />
10
+ <change beforePath="$PROJECT_DIR$/requirements.txt" beforeDir="false" afterPath="$PROJECT_DIR$/requirements.txt" afterDir="false" />
11
+ </list>
12
  <option name="SHOW_DIALOG" value="false" />
13
  <option name="HIGHLIGHT_CONFLICTS" value="true" />
14
  <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
 
38
  <property name="last_opened_file_path" value="$PROJECT_DIR$" />
39
  <property name="settings.editor.selected.configurable" value="com.jetbrains.python.configuration.PyActiveSdkModuleConfigurable" />
40
  </component>
41
+ <component name="RecentsManager">
42
+ <key name="CopyFile.RECENT_KEYS">
43
+ <recent name="C:\Users\borod\OneDrive\Documents\Projects\GitHub\science-classification" />
44
+ </key>
45
+ </component>
46
  <component name="RunManager">
47
  <configuration name="app" type="PythonConfigurationType" factoryName="Python" temporary="true" nameIsGenerated="true">
48
  <option name="INTERPRETER_OPTIONS" value="" />
app.py CHANGED
@@ -1,59 +1,28 @@
1
  import gradio as gr
2
- from re import sub
3
- from datasets import load_dataset
4
- import pandas as pd
5
- from sklearn.feature_extraction.text import TfidfVectorizer
6
- from sklearn.svm import LinearSVC
7
 
8
 
9
- def clean_text(text):
10
- """
11
- Applies some pre-processing on the given text.
 
 
 
 
 
12
 
13
- Steps :
14
- - Lowering text
15
- - Removing backslashes
16
- - removes a. out of the answers
17
- - replaces 'b.', 'c.', and 'd.' with comma ','
18
- """
19
- text = text.lower()
20
- text = text.replace('\\', '') # generic replace was advised by Danit
21
- text = text.replace('a.', '')
22
- text = text.replace('b.', ',')
23
- text = text.replace('c.', ',')
24
- text = text.replace('d.', ',')
25
- text = sub("\d+", "<num>", text)
26
 
27
- return ' '.join(text.split())
 
28
 
 
29
 
30
- dataset = load_dataset(
31
- "csv",
32
- data_files={
33
- "train": "https://huggingface.co/borodache/science-classification-data/resolve/main/subjects-questions.csv"
34
- },
35
- )
36
 
37
- df = dataset["train"]
38
-
39
- # prepare data
40
- df = df[df['category'].isin(['Physics', 'Chemistry', 'Maths', 'Biology'])]
41
- questions = df['question']
42
- new_questions = []
43
- for question in questions:
44
- new_questions.append(clean_text(question))
45
-
46
- X = new_questions
47
- y = df['category']
48
- tfidf_vectorizer = TfidfVectorizer(stop_words="english",
49
- preprocessor=clean_text,
50
- ngram_range=(1, 2))
51
-
52
- features_tfidf = tfidf_vectorizer.fit_transform(X)
53
-
54
- # Training
55
- model = LinearSVC()
56
- model.fit(features_tfidf, y)
57
 
58
 
59
  def classify(sentence):
 
1
  import gradio as gr
2
+ from sklearn.pipeline import Pipeline
3
+ import joblib
 
 
 
4
 
5
 
6
+ class CustomTextClassificationPipeline(Pipeline):
7
+ def __init__(self):
8
+ tfidf_vectorizer = joblib.load("tfidf_vectorizer.joblib")
9
+ linear_svc = joblib.load("model_linear_svc.joblib")
10
+ super().__init__([
11
+ ('tfidf', tfidf_vectorizer),
12
+ ('classifier', linear_svc)
13
+ ])
14
 
15
+ def predict(self, text):
16
+ # Call the parent predict method to get the list of predicted labels
17
+ y_pred_list = super().predict([text])
 
 
 
 
 
 
 
 
 
 
18
 
19
+ # Convert the list to a string by taking the first element
20
+ y_pred_str = str(y_pred_list[0])
21
 
22
+ return y_pred_str
23
 
 
 
 
 
 
 
24
 
25
+ model = CustomTextClassificationPipeline()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
 
27
 
28
  def classify(sentence):
model_linear_svc.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7b049ab18b3d50cc79374c0ab9b544ad18807b063c65f82c9fcd62709b0a1fc
3
+ size 17757920
requirements.txt CHANGED
@@ -1,5 +1,3 @@
1
  gradio
2
- datasets
3
- pandas
4
- huggingface_hub
5
  scikit-learn
 
 
1
  gradio
 
 
 
2
  scikit-learn
3
+ joblib
tfidf_vectorizer.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ceb0a87b45261e3cb093b72d2c565dea618c7bd63b9e6f6c16cab1b9a4906d53
3
+ size 24881984
utils.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from re import sub
2
+
3
+
4
+ def clean_text(text):
5
+ """
6
+ Applies some pre-processing on the given text.
7
+
8
+ Steps :
9
+ - Lowering text
10
+ - Removing backslashes
11
+ - removes a. out of the answers
12
+ - replaces 'b.', 'c.', and 'd.' with comma ','
13
+ """
14
+ text = text.lower()
15
+ text = text.replace('\\', '') # generic replace was advised by Danit
16
+ text = text.replace('a.', '')
17
+ text = text.replace('b.', ',')
18
+ text = text.replace('c.', ',')
19
+ text = text.replace('d.', ',')
20
+ text = sub("\d+", "<num>", text)
21
+
22
+ return ' '.join(text.split())