API details.
import pandas as pd, numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from random import sample
import re, string
import pandas as pd
import warnings
warnings.filterwarnings('ignore')
Get and prepare the data
# Load the labelled training set and the unlabelled test set from disk.
train = pd.read_csv("../data/labled_train_set.csv")
test = pd.read_csv("../data/unlabled_test_set.csv")
train.shape[0]
train.head()
# Number of training rows per category value.
cat_train = train["category"].value_counts()
cat_train
# Rename the test text column to "comment_text", the name the rest of the
# script uses for the text column.
test = test.rename(columns={"tweet": "comment_text"})
test.head()
# Materialise the row index as a column; it is renamed to "id" below.
train = train.reset_index()
# One indicator column per distinct value of "category".
label_df = pd.get_dummies(train["category"])
label_cols = label_df.columns.tolist()
print(label_cols)
# Replace the single "category" column with its indicator columns.
train = pd.concat([train, label_df], axis="columns").drop(columns="category")
train = train.rename(columns={"index": "id", "tweet": "comment_text"})
train.head()
train.loc[0, 'comment_text']
train.loc[4, 'comment_text']
# 1 where none of the indicator columns is set for the row, 0 otherwise.
train['none'] = 1 - train[label_cols].max(axis="columns")
train.describe()
# Name of the text column shared by the train and test frames.
COMMENT = 'comment_text'
# Replace missing texts with a placeholder before vectorising.
# The result is assigned back rather than calling fillna(inplace=True) on the
# selected column: that form is chained assignment, which pandas warns about
# in 2.x and which silently leaves the frame unchanged under copy-on-write.
train[COMMENT] = train[COMMENT].fillna("unknown")
test[COMMENT] = test[COMMENT].fillna("unknown")
# Matches a single punctuation character (ASCII punctuation plus a few extra
# typographic symbols). string.punctuation is passed through re.escape so
# that every character is taken literally inside the character class;
# interpolated raw, its "\]" sequence is read as an escaped "]" and the
# backslash itself is never matched.
re_tok = re.compile(f'([{re.escape(string.punctuation)}“”¨«»®´·º½¾¿¡§£₤‘’])')


def tokenize(s):
    """Split *s* into whitespace-separated tokens, isolating punctuation.

    Every character matched by ``re_tok`` is surrounded with spaces before
    splitting, so each punctuation mark becomes a token of its own.

    Args:
        s: The text to tokenize.

    Returns:
        A list of token strings; empty when *s* is empty or only whitespace.
    """
    return re_tok.sub(r' \1 ', s).split()
Model
# Number of training rows.
n = train.shape[0]
print(n)
# TF-IDF over unigrams and bigrams, using the custom punctuation-aware
# tokenizer. The boolean options are passed as real booleans: scikit-learn's
# parameter validation rejects integers such as 1 for them.
vec = TfidfVectorizer(
    ngram_range=(1, 2),
    tokenizer=tokenize,
    min_df=3,
    max_df=0.9,
    strip_accents='unicode',
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)
# Learn the vocabulary and IDF weights on the training text only, then apply
# the same fitted transform to the test text.
trn_term_doc = vec.fit_transform(train[COMMENT])
test_term_doc = vec.transform(test[COMMENT])
trn_term_doc, test_term_doc
# The basic naive Bayes feature equation:
def pr(y_i, y):
    """Return the add-one smoothed per-feature sum for rows where y == y_i.

    Reads the module-level matrix ``x``. The rows of ``x`` whose entry in
    ``y`` equals ``y_i`` are summed along axis 0; one is added to that sum
    and the result is divided by the number of selected rows plus one.
    """
    in_class = y == y_i
    feature_totals = x[in_class].sum(0)
    return (feature_totals + 1) / (in_class.sum() + 1)


# Short aliases for the train and test document-term matrices.
x, test_x = trn_term_doc, test_term_doc
# Fit a model for one dependent at a time:
def get_mdl(y):
    """Fit a logistic regression on naive-Bayes-scaled features for one label.

    Args:
        y: Label column for the rows of the module-level matrix ``x``; must
            expose ``.values`` (the caller passes a DataFrame column).

    Returns:
        A tuple ``(model, r)``: the fitted ``LogisticRegression`` and the
        log-count ratio ``r`` that the features were multiplied by. The same
        ``r`` must be applied to any matrix passed to the model later.
    """
    y = y.values
    # Log of the ratio between the smoothed feature statistics of the
    # positive (1) and negative (0) rows.
    r = np.log(pr(1, y) / pr(0, y))
    # The dual formulation is only implemented by the liblinear solver, so it
    # is selected explicitly; the default solver raises on dual=True.
    m = LogisticRegression(C=4, dual=True, solver='liblinear')
    x_nb = x.multiply(r)
    return m.fit(x_nb, y), r
# Predicted probabilities: one row per test row, one column per label.
preds = np.zeros((test.shape[0], len(label_cols)))
for i, j in enumerate(label_cols):
    print('fit', j)
    m, r = get_mdl(train[j])
    # Scale the test features by the ratio returned with this label's model.
    test_x_nb = test_x.multiply(r)
    # Keep column 1 of predict_proba's output.
    preds[:, i] = m.predict_proba(test_x_nb)[:, 1]
test.head()
# Submission frame: the test ids followed by one probability column per label.
submid = pd.DataFrame({'id': test["id"]})
pred_df = pd.DataFrame(preds, columns=label_cols)
submission = pd.concat([submid, pred_df], axis="columns")
submission.head()