Ref: https://www.kaggle.com/jhoward/nb-svm-strong-linear-baseline

import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from random import sample 
import re, string

import pandas as pd
import warnings
warnings.filterwarnings('ignore')

Get and prepare the data

# Read the training and test CSV files into DataFrames.
train = pd.read_csv("../data/labled_train_set.csv")
test = pd.read_csv("../data/unlabled_test_set.csv")

# Number of rows in the training frame.
train.shape[0]

29424

# Peek at the first rows of the raw training frame.
train.head()

# Count how many rows fall into each value of the `category` column.
cat_train = train["category"].value_counts()
cat_train

NEUTRE       11139
NEGATIF      10487
POSITIF       5862
MIXPOSNEG     1936
Name: category, dtype: int64

# Use the same text column name ("comment_text") in the test frame as in train.
test = test.rename(columns={"tweet": "comment_text"})
test.head()

# reset_index() turns the current row index into an "index" column, which is
# renamed to "id" further down.
train = train.reset_index()

# One indicator column per distinct category value.
label_df = pd.get_dummies(train["category"])
label_cols = label_df.columns.tolist()
print(label_cols)

# Append the indicator columns, drop the raw label, and normalise column names.
train = (
    pd.concat([train, label_df], axis=1)
    .drop(columns="category")
    .rename(columns={"index": "id", "tweet": "comment_text"})
)
train.head()

['MIXPOSNEG', 'NEGATIF', 'NEUTRE', 'POSITIF']

# Display the text stored under row label 0 of the comment column.
train['comment_text'][0]

'"@MathiasColines Bjr  tous les trains desservent effectivement la gare du Parc des Expositions du début à fin de service. Bonne journée ^T"'

# Display the text stored under row label 4 of the comment column.
train['comment_text'][4]

'"Je suis bloquée dans le bus sur le periph et le conducteur ne sait pas comment sortir ! #325 @GroupeRATP l\' aider par radio  c\' est possible ?"'

# 'none' is 1 minus the row-wise maximum of the label indicator columns,
# i.e. 1 for a row where every indicator is 0 and 0 otherwise.
any_label = train[label_cols].max(axis=1)
train['none'] = 1 - any_label
train.describe()

---------------------------------------------------------------------------
ModuleNotFoundError                       Traceback (most recent call last)
ModuleNotFoundError: No module named 'numpy.core._multiarray_umath'

COMMENT = 'comment_text'
# Replace missing texts with a placeholder string before vectorizing.
# The result is assigned back rather than calling fillna(inplace=True) on the
# `frame[COMMENT]` selection: under pandas Copy-on-Write that selection is a
# temporary object, so an in-place fill on it never reaches the DataFrame
# (and the warning about it is hidden by the global warnings filter).
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")

# Characters that become stand-alone tokens: ASCII punctuation plus a handful
# of typographic symbols. string.punctuation is passed through re.escape()
# because it is interpolated into a character class; left raw, its `\]` pair is
# parsed as an escaped `]`, which silently drops the backslash from the class.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split *s* into tokens, keeping each punctuation mark as its own token.

    Every character matched by ``re_tok`` is padded with a space on both sides
    and the result is split on whitespace, so an empty or whitespace-only
    string yields ``[]``.
    """
    return re_tok.sub(r' \1 ', s).split()

Model

n = train.shape[0]
print(n)

# TF-IDF features over 1- and 2-grams produced by the custom tokenizer.
# The on/off options are passed as real booleans: newer scikit-learn releases
# validate parameter types and reject integers such as use_idf=1.
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode',
                      use_idf=True, smooth_idf=True, sublinear_tf=True)

# Fit the vocabulary and IDF weights on the training text only, then apply the
# same fitted transform to the test text.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])

29424

# Display the train and test document-term matrices returned by the vectorizer.
trn_term_doc, test_term_doc

(<29424x43432 sparse matrix of type '<class 'numpy.float64'>'
 	with 1205034 stored elements in Compressed Sparse Row format>,
 <7357x43432 sparse matrix of type '<class 'numpy.float64'>'
 	with 292053 stored elements in Compressed Sparse Row format>)

# Here's the basic naive bayes feature equation:
def pr(y_i, y):
    """Return the add-one smoothed column sums of ``x`` for rows labelled *y_i*.

    Reads the module-level matrix ``x``. The rows of ``x`` where ``y == y_i``
    are summed column-wise; 1 is added to that sum and to the number of
    selected rows before dividing.
    """
    mask = y == y_i
    feature_totals = x[mask].sum(0)
    return (feature_totals + 1) / (mask.sum() + 1)

# Short aliases for the train / test document-term matrices; pr() and
# get_mdl() read `x` as a module-level global.
x, test_x = trn_term_doc, test_term_doc

# Fit a model for one dependent at a time:
def get_mdl(y):
    """Fit the NB-weighted logistic regression for one binary label column.

    Args:
        y: Object exposing ``.values`` (e.g. a pandas Series) holding the
            indicator for one label, one entry per row of the module-level
            matrix ``x``.

    Returns:
        ``(model, r)``: the fitted ``LogisticRegression`` and the log-count
        ratio ``r``. Features passed to the model at prediction time must be
        multiplied by the same ``r``.
    """
    y = y.values
    # Log of the smoothed feature ratio between positive (1) and negative (0)
    # rows, as computed by pr().
    r = np.log(pr(1, y) / pr(0, y))
    # dual=True is only implemented by the liblinear solver. The default solver
    # has been lbfgs since scikit-learn 0.22 and raises ValueError for
    # dual=True, so the solver is pinned explicitly.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r

# One column of predicted probabilities per label, in label_cols order.
preds = np.zeros((test.shape[0], len(label_cols)))

for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    # Scale the test features by the ratio returned with the model, then keep
    # the second column of predict_proba.
    test_x_nb = test_x.multiply(r)
    preds[:, i] = m.predict_proba(test_x_nb)[:, 1]

fit MIXPOSNEG
fit NEGATIF
fit NEUTRE
fit POSITIF

test.head()

# Assemble the submission frame: the test ids followed by one prediction
# column per label.
submid = pd.DataFrame({'id': test["id"]})
pred_df = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, pred_df], axis=1)

submission.head()

	id	comment_text
0	0	"@placardobalais Ils prenaient des bus différe...
1	1	"@Rowlfg c' est loin de chez elle le ministère...
2	2	"3 € en plus du Navigo ! On en parle de la pro...
3	3	"@Skyschips 😂😂😂 fou rire dans le bus ma blague...
4	4	"Le 11 octobre je fais de ma voiture un bus et...

	id	MIXPOSNEG	NEGATIF	NEUTRE	POSITIF	none
count	29424.000000	29424.000000	29424.000000	29424.000000	29424.000000	29424.0
mean	14711.500000	0.065797	0.356410	0.378569	0.199225	0.0
std	8494.121497	0.247930	0.478946	0.485039	0.399424	0.0
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.0
25%	7355.750000	0.000000	0.000000	0.000000	0.000000	0.0
50%	14711.500000	0.000000	0.000000	0.000000	0.000000	0.0
75%	22067.250000	0.000000	1.000000	1.000000	0.000000	0.0
max	29423.000000	1.000000	1.000000	1.000000	1.000000	0.0

	id	comment_text
0	0	"@placardobalais Ils prenaient des bus différe...
1	1	"@Rowlfg c' est loin de chez elle le ministère...
2	2	"3 € en plus du Navigo ! On en parle de la pro...
3	3	"@Skyschips 😂😂😂 fou rire dans le bus ma blague...
4	4	"Le 11 octobre je fais de ma voiture un bus et...

	id	MIXPOSNEG	NEGATIF	NEUTRE	POSITIF
0	0	0.007891	0.768337	0.188729	0.064118
1	1	0.006476	0.145842	0.869466	0.117607
2	2	0.007517	0.209492	0.498004	0.131356
3	3	0.030724	0.060470	0.099704	0.649403
4	4	0.008263	0.250806	0.453443	0.404758

	tweet	category
0	"@MathiasColines Bjr tous les trains desserve...	POSITIF
1	"@IsaDuquette s' il trippe manège Coney Islan...	POSITIF
2	"@oger_dominique Bonsoir votre train circule ...	NEGATIF
3	"@SNCF a quand plus de trains gare de vert de ...	NEGATIF
4	"Je suis bloquée dans le bus sur le periph et ...	MIXPOSNEG

NBSVM

Get and prepare the data

Model