import os
import time
import sys
import inspect
import pandas as pd
import numpy as np
import datetime
try:
    from fasttext import train_unsupervised
    import fasttext
except ImportError:
    # Older releases ship under the package name fastText
    from fastText import train_unsupervised
    import fastText as fasttext  # keep the same module name for the load_model call below
import umap.umap_ as umap
import hdbscan
import nltk  # used later to extract n-grams inside the clusters
import plotly.graph_objs as go
import plotly.offline as py
from IPython.display import Image,display
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
Get Data
fasttext_data_folder = "../data/fasttext_data"
train_data = os.path.join(fasttext_data_folder,"text_file.text")
text_df = pd.read_csv(train_data, names=["itemDesc"])
text_df.head()
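If text_file.text is not already available, a tiny placeholder corpus can be written first so the cells above and below run end to end. This is purely illustrative: the file name and contents below are assumptions, the real file should contain one item description per line.
# Hypothetical setup cell: create a small placeholder corpus if the training file is missing.
if not os.path.exists(train_data):
    os.makedirs(fasttext_data_folder, exist_ok=True)
    sample_lines = [
        "organic whole milk 1l",
        "semi skimmed milk 2 pints",
        "dark chocolate bar 70 percent",
    ]
    with open(train_data, "w") as f:
        f.write("\n".join(sample_lines))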
FastText training
cbow_model = os.path.join(fasttext_data_folder,"model_cbow.bin")
skipgram_model = os.path.join(fasttext_data_folder,"model_skipgram.bin")
train_unsupervised parameters (defaults in brackets; a short example of overriding a few of them follows this list):
* input              # training file path (required)
* lr                 # learning rate [0.1]
* dim                # size of word vectors [100]
* ws                 # size of the context window [5]
* epoch              # number of epochs [5]
* minCount           # minimal number of word occurrences [1]
* minCountLabel      # minimal number of label occurrences [1]
* minn               # min length of char ngram [0]
* maxn               # max length of char ngram [0]
* neg                # number of negatives sampled [5]
* wordNgrams         # max length of word ngram [1]
* loss               # loss function {ns, hs, softmax, ova} [softmax]
* bucket             # number of buckets [2000000]
* thread             # number of threads [number of cpus]
* lrUpdateRate       # change the rate of updates for the learning rate [100]
* t                  # sampling threshold [0.0001]
* label              # label prefix ['__label__']
* verbose            # verbose [2]
* pretrainedVectors  # pretrained word vectors (.vec file) for supervised learning []
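As a sketch, a few of these defaults can be overridden directly in the call; the values below are illustrative only and not tuned for this corpus.
# Illustrative only: override some of the defaults listed above.
model_custom = train_unsupervised(
    train_data,
    model='skipgram',
    dim=100,         # size of word vectors
    epoch=10,        # more passes over a small corpus
    minCount=1,      # keep rare tokens
    minn=2, maxn=5,  # character n-gram range for subword embeddings
    loss='ns',       # negative sampling
)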
model_to_use = "skipgram"
if model_to_use == "skipgram":
    # Skipgram model
    model_skipgram = train_unsupervised(train_data, model='skipgram')
    model_skipgram.save_model(skipgram_model)
else:
    # CBOW model
    model_cbow = train_unsupervised(train_data, model='cbow')
    model_cbow.save_model(cbow_model)
# Load the fastText model trained and saved above
model_skipgram = fasttext.load_model(skipgram_model)
model_skipgram.get_dimension()
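A quick way to sanity-check the loaded embeddings is to look at nearest neighbours of a known token. The word used below is a placeholder assumption; pick one that actually occurs in your corpus.
# Sanity check: nearest neighbours in the trained embedding space.
# get_nearest_neighbors returns (cosine_similarity, word) pairs.
print(model_skipgram.get_words()[:10])               # a few tokens from the vocabulary
print(model_skipgram.get_nearest_neighbors("milk"))  # "milk" is a placeholder, use a word from your data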
# get_sentence_vector takes the vectors of all words in the query, divides each by its norm,
# and then averages them into a single sentence vector.
def query_to_vector(col_query, model_fastText):
    vector = col_query.apply(lambda x: model_fastText.get_sentence_vector(x.replace('\n', ' ')))
    return vector
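For intuition, the averaging described in the comment above can be reproduced approximately by hand. This is only a sketch: it assumes whitespace tokenisation and skips edge cases such as empty lines, so it may not match the library exactly.
# Rough manual equivalent of get_sentence_vector, for intuition only.
def manual_sentence_vector(sentence, model_fastText):
    words = sentence.replace('\n', ' ').split()
    word_vecs = []
    for w in words:
        v = model_fastText.get_word_vector(w)
        norm = np.linalg.norm(v)
        if norm > 0:
            v = v / norm          # normalise each word vector
        word_vecs.append(v)
    return np.mean(word_vecs, axis=0)  # average of the normalised vectors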
%timeit text_df['vector'] = query_to_vector(text_df['itemDesc'], model_skipgram)
del model_skipgram # The model is no longer needed in memory once the sentence vectors are computed
text_df.head(10)
Dimension reduction with UMAP
# Stack all the sentence vectors into a single NumPy array
vecs = text_df.vector.values
vecs = np.stack(vecs, axis=0)
vecs.shape
col_names = ["FT_"+str(x) for x in range(0,100)]
print(len(col_names))
test = pd.DataFrame(vecs, columns=col_names)
fit = umap.UMAP(n_neighbors=30,min_dist=0.1,n_components=3,metric='cosine',random_state=42)
%time u = fit.fit_transform(vecs)
text_df['x']=u[:,0]
text_df['y']=u[:,1]
text_df['z']=u[:,2]
u.shape
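Since the interactive 3D plot below can be heavy in the browser, a separate 2D projection is sometimes handy for a first look. This sketch assumes matplotlib is available (it is not imported above) and simply mirrors the 3D UMAP parameters.
# Optional quick look: a 2D UMAP projection plotted with matplotlib, purely for a fast visual check.
import matplotlib.pyplot as plt

fit_2d = umap.UMAP(n_neighbors=30, min_dist=0.1, n_components=2,
                   metric='cosine', random_state=42)
u_2d = fit_2d.fit_transform(vecs)
plt.figure(figsize=(8, 6))
plt.scatter(u_2d[:, 0], u_2d[:, 1], s=2, alpha=0.3)
plt.title("2D UMAP projection of the fastText sentence vectors")
plt.show()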
3D Visualization
# Plot the queries in the embedding space.
# The embedding has been reduced to a 3-dimensional space with UMAP.
def plot_cluster(df, iscolored=False, name='', interactive=True):
    if interactive:
        if iscolored:
            color = df['cluster'].values
        else:
            color = df['x']
        trace1 = go.Scatter3d(
            x=df['x'],
            y=df['y'],
            z=df['z'],
            mode='markers',
            marker=dict(
                size=3,
                color=color,           # set color to an array/list of desired values
                colorscale='Viridis',  # choose a colorscale
                opacity=0.3
            ),
            text=color
        )
        data = [trace1]
        layout = go.Layout(
            margin=dict(l=0, r=0, b=0, t=0)
        )
        fig = go.Figure(data=data, layout=layout)
        file = '../data/fasttext_data/' + name + '.html'
        py.iplot(fig, filename=file)
        py.plot(fig, filename=file, auto_open=False)
is_interactive = False
filename = 'umap_embedding_description'
if is_interactive:
    plot_cluster(text_df, False, filename)
Clustering with HDBSCAN
clusterer = hdbscan.HDBSCAN(min_cluster_size=5,min_samples=5)
clusterer
%time text_df['cluster']= clusterer.fit_predict(u)
text_df.groupby(['cluster'])['itemDesc'].apply(list).apply(len).sort_values(ascending=False)[0:20]
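HDBSCAN also exposes per-point membership strengths, which help judge how crisp the clusters are. The attribute used below (clusterer.probabilities_) is part of the hdbscan API; the summary printed is just a quick diagnostic.
# Fraction of points labelled as noise (cluster -1) and distribution of membership strengths.
noise_fraction = (text_df['cluster'] == -1).mean()
print("Noise fraction: {:.1%}".format(noise_fraction))
print("Number of clusters (excluding noise):",
      text_df.loc[text_df['cluster'] != -1, 'cluster'].nunique())
# probabilities_ holds, for each point, the strength of its cluster assignment (0 for noise).
print(pd.Series(clusterer.probabilities_).describe())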
# A cluster label of -1 means the point was considered noise by the algorithm,
# so we drop those rows.
denoised = text_df[text_df.cluster != -1]
denoised.head()
# Number of clusters
print('Total number of clusters: '+str(len(denoised.cluster.unique())))
clusters=denoised.groupby(['cluster'])['itemDesc'].apply(list)
cluster_i = clusters[0]
print(len(cluster_i))
cluster_i
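The clusters can also be inspected visually by re-using the plot_cluster helper on the denoised frame, colouring points by cluster label. The output filename is arbitrary.
# Re-use the plotting helper defined above, colouring by cluster label.
if is_interactive:
    plot_cluster(denoised, iscolored=True, name='umap_embedding_clusters')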
TF-IDF within the clusters
def get_top_grams(cluster_j):
    # Tokenise the whole cluster, then return its most frequent trigram, bigram and unigram.
    # Requires the NLTK 'punkt' tokenizer data (nltk.download('punkt')).
    tokens = nltk.word_tokenize(" ".join(cluster_j))
    tgs = nltk.trigrams(tokens)
    bgs = nltk.bigrams(tokens)
    # Frequency distribution of the trigrams
    fdist = nltk.FreqDist(tgs)
    trigram = " ".join(fdist.most_common(1)[0][0]), fdist.most_common(1)[0][1]
    # Frequency distribution of the bigrams
    fdist = nltk.FreqDist(bgs)
    bigram = " ".join(fdist.most_common(1)[0][0]), fdist.most_common(1)[0][1]
    # Frequency distribution of the single tokens
    fdist = nltk.FreqDist(tokens)
    onegram = fdist.most_common(1)[0][0], fdist.most_common(1)[0][1]
    return trigram, bigram, onegram
# Cluster description: most frequent n-grams per cluster
for i, cluster_i in enumerate(clusters):
    print("---------")
    print("cluster {} size:{}".format(i, len(cluster_i)))
    print(get_top_grams(cluster_i))
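The section title mentions TF-IDF, while the code above only counts raw n-gram frequencies. A hedged sketch of cluster-level TF-IDF with scikit-learn (an extra dependency not imported above) could look like this: each cluster's concatenated descriptions are treated as one document, and the highest-weighted terms are listed per cluster.
# Illustrative cluster-level TF-IDF; requires scikit-learn.
from sklearn.feature_extraction.text import TfidfVectorizer

cluster_docs = clusters.apply(lambda descs: " ".join(descs))
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
tfidf = vectorizer.fit_transform(cluster_docs)
terms = np.array(vectorizer.get_feature_names_out())  # older scikit-learn: get_feature_names()

for cluster_id, row in zip(cluster_docs.index, tfidf.toarray()):
    top_terms = terms[row.argsort()[::-1][:5]]
    print("cluster {}: {}".format(cluster_id, ", ".join(top_terms)))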