API details.
import os
import time
import sys
import inspect

import pandas as pd
import numpy as np

import datetime

try:
    from fasttext import train_unsupervised
    import fasttext
except ImportError:
    from fastText import train_unsupervised
    import fastText

import umap.umap_ as umap
import hdbscan

import plotly.graph_objs as go
import plotly.offline as py
from IPython.display import Image, display

import nltk  # used later to extract the most frequent n-grams per cluster (needs the 'punkt' tokenizer data)

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

Get Data

fasttext_data_folder = "../data/fasttext_data"
train_data = os.path.join(fasttext_data_folder,"text_file.text")
text_df = pd.read_csv(train_data, names=["itemDesc"])
text_df.head()
itemDesc
0 MINI DONUTS FOURRES ASSORTI X6
1 NAVETTE ASSORTIMENT X30
2 BRIOCHE MOUNA 400G
3 BRIOCHETTE ST GENIX X4 LOCAL
4 BRIOCHE ST GENIX LOCAL

FastText training

cbow_model = os.path.join(fasttext_data_folder,"model_cbow.bin")
skipgram_model = os.path.join(fasttext_data_folder,"model_skipgram.bin")

train_unsupervised parameters (an illustrative call is sketched after this list)

input             # training file path (required)
lr                # learning rate [0.1]
dim               # size of word vectors [100]
ws                # size of the context window [5]
epoch             # number of epochs [5]
minCount          # minimal number of word occurrences [1]
minCountLabel     # minimal number of label occurrences [1]
minn              # min length of char ngram [0]
maxn              # max length of char ngram [0]
neg               # number of negatives sampled [5]
wordNgrams        # max length of word ngram [1]
loss              # loss function {ns, hs, softmax, ova} [softmax]
bucket            # number of buckets [2000000]
thread            # number of threads [number of cpus]
lrUpdateRate      # change the rate of updates for the learning rate [100]
t                 # sampling threshold [0.0001]
label             # label prefix ['__label__']
verbose           # verbose [2]
pretrainedVectors # pretrained word vectors (.vec file) for supervised learning []
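
As a quick reference, here is an illustrative call that passes a few of these parameters explicitly. The values below are examples, not the settings used in this notebook, and the name model_example is arbitrary.

# Illustrative only: an explicit hyperparameter call to train_unsupervised.
# These values are examples, not the ones used in the cells below.
model_example = train_unsupervised(
    train_data,        # path to the raw text file
    model='skipgram',  # 'skipgram' or 'cbow'
    lr=0.05,           # learning rate
    dim=100,           # size of word vectors
    ws=5,              # context window size
    epoch=10,          # number of passes over the data
    minCount=1,        # ignore words with fewer occurrences
    minn=2, maxn=5,    # character n-gram lengths for subword features
    thread=4,          # number of CPU threads
)
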
model_to_use = "skipgram"

if model_to_use == "skipgram":
    # Skip-gram model
    model_skipgram = train_unsupervised(train_data, model='skipgram')
    model_skipgram.save_model(skipgram_model)
else:
    # CBOW model
    model_cbow = train_unsupervised(train_data, model='cbow')
    model_cbow.save_model(cbow_model)
# Load the pretrained fastText model
model_skipgram = fasttext.load_model(skipgram_model)

model_skipgram.get_dimension()
100
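
A quick sanity check of the trained embeddings: get_nearest_neighbors (available in recent versions of the fasttext Python bindings) returns the closest words by cosine similarity. The query word below is just an example taken from the data.

# Sanity check (illustrative): nearest neighbours of an example token
model_skipgram.get_nearest_neighbors('BRIOCHE', k=5)
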
# get_sentence_vector takes the vector of every word in the query, divides each by its norm, and then averages the normalised vectors
def query_to_vector(col_query, model_fastText):
    vector = col_query.apply(lambda x: model_fastText.get_sentence_vector(x.replace('\n', ' ')))
    return vector
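
To make the comment above concrete, here is a simplified sketch of what get_sentence_vector does for a whitespace-tokenised query: each word vector is L2-normalised (when its norm is non-zero) and the normalised vectors are averaged. The actual fastText implementation may differ in edge cases; sentence_vector_sketch is a hypothetical helper, not part of the library.

# Simplified re-implementation of get_sentence_vector, for illustration only
def sentence_vector_sketch(model, sentence):
    vectors = []
    for word in sentence.replace('\n', ' ').split():
        v = model.get_word_vector(word)
        norm = np.linalg.norm(v)
        if norm > 0:
            vectors.append(v / norm)          # normalise each word vector
    if not vectors:
        return np.zeros(model.get_dimension())
    return np.mean(vectors, axis=0)           # average of the normalised vectors

# Should be close to model_skipgram.get_sentence_vector("BRIOCHE MOUNA 400G")
sentence_vector_sketch(model_skipgram, "BRIOCHE MOUNA 400G")
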
%timeit text_df['vector'] = query_to_vector(text_df['itemDesc'], model_skipgram)
del model_skipgram # We no longer need the pretrained vectors in memory
The slowest run took 4.69 times longer than the fastest. This could mean that an intermediate result is being cached.
20.5 ms ± 15.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
text_df.head(10)
itemDesc vector
0 MINI DONUTS FOURRES ASSORTI X6 [0.021459619, 0.10777775, -0.015001227, -0.022...
1 NAVETTE ASSORTIMENT X30 [0.014762469, 0.046633586, -0.042754717, -0.01...
2 BRIOCHE MOUNA 400G [-0.052082933, 0.0019508253, 0.006112094, -0.0...
3 BRIOCHETTE ST GENIX X4 LOCAL [-0.010170468, -0.015941408, -0.0090812, -0.02...
4 BRIOCHE ST GENIX LOCAL [-0.017857304, -0.030833643, -0.0024532434, -0...
5 NAVETTE X20 [0.026398266, -0.0045050173, -0.047125924, -0....
6 TARTE AU SUCRE 6P LOCAL [-0.02277725, -0.0873375, -0.012378287, 0.0324...
7 MINI SUISSE X8 [0.09523194, 0.010796974, 0.14384931, -0.13409...
8 4 P RAIS+4 P CHOC.+4 CROIS. PB [0.011881388, 0.034624543, -0.043392614, 0.031...
9 X5 POCHE TALOAK [0.011327958, 0.00809449, 0.006738869, -0.0332...

Dimension reduction with UMAP

# Stack all the vectors into a single numpy array
vecs = text_df.vector.values
vecs = np.stack(vecs, axis=0)
vecs.shape
(583, 100)
col_names = ["FT_"+str(x) for x in range(0,100)]
print(len(col_names))
test = pd.DataFrame(vecs, columns=col_names)
100
fit = umap.UMAP(n_neighbors=30, min_dist=0.1, n_components=3, metric='cosine', random_state=42)
%time u = fit.fit_transform(vecs)

text_df['x']=u[:,0]
text_df['y']=u[:,1]
text_df['z']=u[:,2]
CPU times: user 1.43 s, sys: 59.1 ms, total: 1.49 s
Wall time: 1.4 s
u.shape
(583, 3)
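
The fitted reducer can also project unseen descriptions into the same 3D space with its transform method. A hedged sketch follows: the new item strings are made up for illustration, and the fastText model is reloaded here because it was deleted above to free memory.

# Illustrative: embed new item descriptions into the same UMAP space
model_tmp = fasttext.load_model(skipgram_model)
new_items = ["BRIOCHE TRESSEE 500G", "X6 DONUTS CHOCOLAT"]   # example strings
new_vecs = np.stack([model_tmp.get_sentence_vector(s) for s in new_items])
new_coords = fit.transform(new_vecs)   # shape (2, 3), same axes as u
del model_tmp
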

3D Visualisation

# This is the function to plot the queries in the embedding space.
# Here the embedding has been reduced to a 3-dimensional space
def plot_cluster(df, iscolored=False, name='', interactive=True):

    if interactive:
        if iscolored:
            color = df['cluster'].values
        else:
            color = df['x']
        trace1 = go.Scatter3d(
            x=df['x'],
            y=df['y'],
            z=df['z'],

            mode='markers',
            marker=dict(
                size=3,
                color=color,                # set color to an array/list of desired values
                colorscale='Viridis',   # choose a colorscale
                opacity=0.3
            ),
            text=color
        )

        data = [trace1]
        layout = go.Layout(
            margin=dict(
                l=0,
                r=0,
                b=0,
                t=0
            )
        )
        fig = go.Figure(data=data, layout=layout)
        file='../data/fasttext_data/'+name+'.html'
        py.iplot(fig, filename=file)
        py.plot(fig, filename=file,auto_open=False)
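
The non-interactive case is left unimplemented even though Image and display are imported above. A possible completion is sketched below; it assumes the kaleido package is installed for plotly's static image export, and plot_cluster_static is a hypothetical helper, not part of the original notebook.

# Hypothetical: render a static PNG (requires kaleido) and show it inline
def plot_cluster_static(df, iscolored=False, name=''):
    color = df['cluster'].values if iscolored else df['x']
    fig = go.Figure(
        data=[go.Scatter3d(x=df['x'], y=df['y'], z=df['z'], mode='markers',
                           marker=dict(size=3, color=color,
                                       colorscale='Viridis', opacity=0.3))],
        layout=go.Layout(margin=dict(l=0, r=0, b=0, t=0)))
    file = '../data/fasttext_data/' + name + '.png'
    fig.write_image(file)
    display(Image(filename=file))
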
is_interactive=False
filename='umap_embedding_description'

if is_interactive:
    plot_cluster(text_df,False,filename)

Clustering with HDBSCAN

clusterer = hdbscan.HDBSCAN(min_cluster_size=5,min_samples=5)
clusterer
HDBSCAN(algorithm='best', allow_single_cluster=False, alpha=1.0,
        approx_min_span_tree=True, cluster_selection_epsilon=0.0,
        cluster_selection_method='eom', core_dist_n_jobs=4,
        gen_min_span_tree=False, leaf_size=40,
        match_reference_implementation=False, memory=Memory(location=None),
        metric='euclidean', min_cluster_size=5, min_samples=5, p=None,
        prediction_data=False)
%time text_df['cluster']= clusterer.fit_predict(u)
CPU times: user 26 ms, sys: 5.75 ms, total: 31.8 ms
Wall time: 33.9 ms
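
HDBSCAN also exposes a soft membership strength for every point. The sketch below attaches it to the dataframe to spot weakly assigned items; the column name cluster_prob is just a choice made here.

# Membership strength of each point in its cluster (0.0 for noise points)
text_df['cluster_prob'] = clusterer.probabilities_
# Items assigned to a cluster but with low confidence
text_df[(text_df.cluster != -1) & (text_df.cluster_prob < 0.5)][['itemDesc', 'cluster', 'cluster_prob']].head()
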
text_df.groupby(['cluster'])['itemDesc'].apply(list).apply(len).sort_values(ascending=False)[0:20]
cluster
-1     153
 13     95
 16     29
 1      28
 18     23
 8      22
 2      19
 25     19
 23     17
 9      16
 21     14
 3      14
 4      13
 5      13
 19     12
 6      11
 14     10
 10      9
 22      8
 12      8
Name: itemDesc, dtype: int64
# Cluster -1 actually means the algorithm considered those points as noise,
# so we remove them
denoised = text_df[text_df.cluster != -1]
denoised.head()
itemDesc vector x y z cluster
0 MINI DONUTS FOURRES ASSORTI X6 [0.021459619, 0.10777775, -0.015001227, -0.022... 1.819945 8.172493 5.891437 8
2 BRIOCHE MOUNA 400G [-0.052082933, 0.0019508253, 0.006112094, -0.0... 5.309334 5.404963 3.979075 23
3 BRIOCHETTE ST GENIX X4 LOCAL [-0.010170468, -0.015941408, -0.0090812, -0.02... 5.848576 6.289163 5.398617 18
4 BRIOCHE ST GENIX LOCAL [-0.017857304, -0.030833643, -0.0024532434, -0... 5.889108 6.089911 5.217195 18
7 MINI SUISSE X8 [0.09523194, 0.010796974, 0.14384931, -0.13409... 2.519321 7.063348 5.902770 10
# Number of clusters
print('Total number of clusters: '+str(len(denoised.cluster.unique())))
Total number of clusters: 26
clusters=denoised.groupby(['cluster'])['itemDesc'].apply(list)
cluster_i = clusters[0]
print(len(cluster_i))
cluster_i
6
['4 MICRO DONUTS ROSE',
 '4 MICRO DONUTS CHOCO',
 '4 MICRO DONUTS BLANC CONFETTI',
 '4 MICRO DONUTS CHOCO POPPIES',
 '4 MICRO DONUTS ROSE POPPIES',
 '4 MICRO DONUTS BLANC CONFETTI']

TF-IDF into the clusters

def get_top_grams(cluster_j):
    tokens = nltk.word_tokenize(" ".join(cluster_j))

    # Create the trigrams and bigrams
    tgs = nltk.trigrams(tokens)
    bgs = nltk.bigrams(tokens)

    # Compute the frequency distribution of each n-gram order and keep
    # the most common n-gram together with its count
    fdist = nltk.FreqDist(tgs)
    trigram = " ".join(fdist.most_common(1)[0][0]), fdist.most_common(1)[0][1]
    fdist = nltk.FreqDist(bgs)
    bigram = " ".join(fdist.most_common(1)[0][0]), fdist.most_common(1)[0][1]
    fdist = nltk.FreqDist(tokens)
    onegram = fdist.most_common(1)[0][0], fdist.most_common(1)[0][1]

    return trigram, bigram, onegram
# Cluster description
for i, cluster_i in enumerate(clusters):
    print("---------")
    print("cluster {} size:{}".format(i,len(cluster_i)))
    print(get_top_grams(cluster_i))
---------
cluster 0 size:6
(('4 MICRO DONUTS', 6), ('4 MICRO', 6), ('4', 6))
---------
cluster 1 size:28
(('X4 DONUTS ASSORTIS', 3), ('X4 DONUTS', 13), ('DONUTS', 23))
---------
cluster 2 size:19
(('X2 DONUT FOURRE', 4), ('X2 DONUTS', 10), ('X2', 18))
---------
cluster 3 size:14
(('POGNE DE ROMANS', 5), ('POGNE DE', 5), ('500G', 8))
---------
cluster 4 size:13
(('CHAUSSONS AUX POMMES', 7), ('AUX POMMES', 12), ('AUX', 13))
---------
cluster 5 size:13
(('PASTEIS DE NATA', 5), ('DE NATA', 8), ('X4', 10))
---------
cluster 6 size:11
(('PEPITES DE CHOCO', 3), ('BRESSANE PEPITES', 3), ('PEPITES', 8))
---------
cluster 7 size:8
(('PAINS AUX RAISINS', 4), ('AUX RAISINS', 7), ('AUX', 8))
---------
cluster 8 size:22
(('MINI DONUTS FOURRES', 2), ('MINI VIENNOISERIE', 9), ('MINI', 20))
---------
cluster 9 size:16
(('4 + 1', 3), ('550 G', 4), ('BRETZELS', 11))
---------
cluster 10 size:9
(('X8 200G MINI', 6), ('X8 200G', 6), ('MINI', 9))
---------
cluster 11 size:7
(('BOULE BRIOCHE SUCRE', 3), ('BOULE BRIOCHE', 6), ('BOULE', 6))
---------
cluster 12 size:8
(('+10 % BLISTER', 2), ('BLISTER BUGNES', 3), ('220G', 6))
---------
cluster 13 size:95
(('AU BEURRE AOP', 7), ('PUR BEURRE', 18), ('PAIN', 32))
---------
cluster 14 size:10
(('TRIANGLE AMANDE X2', 2), ('AMANDE X2', 4), ('X2', 7))
---------
cluster 15 size:7
(('X12 MINI BEIGNET', 3), ('MINI BEIGNET', 5), ('BEIGNET', 7))
---------
cluster 16 size:29
(('BEIGNET POMME X2', 3), ('MAXI BEIGNET', 5), ('BEIGNET', 15))
---------
cluster 17 size:7
(('ANIMATION LOCAL BEIGNET', 4), ('LOCAL BEIGNET', 6), ('BEIGNET', 7))
---------
cluster 18 size:23
(('LOCAL BRIOCHE COURONNE', 3), ('LOCAL BRIOCHE', 16), ('LOCAL', 20))
---------
cluster 19 size:12
(('300G BRIOCHE NATURE', 1), ('300G BRIOCHE', 5), ('300G', 8))
---------
cluster 20 size:8
(('BRIOCHE FEUILLETEE 350G', 1), ('350G BRIOCHE', 3), ('BRIOCHE', 8))
---------
cluster 21 size:14
(('LOCAL TARTE BRESSANE', 5), ('TARTE BRESSANE', 6), ('BRESSANE', 14))
---------
cluster 22 size:8
(('600G BRIOCHE TRESSEE', 2), ('BRIOCHE TRESSEE', 5), ('BRIOCHE', 8))
---------
cluster 23 size:17
(('400G BRIOCHE NANTERRE', 3), ('400G BRIOCHE', 13), ('BRIOCHE', 17))
---------
cluster 24 size:7
(('LOCAL JESUITES X2', 2), ('X2 LOCAL', 6), ('LOCAL', 7))
---------
cluster 25 size:19
(('X 4 LOCAL', 3), ('LOCAL COQUILLE', 6), ('LOCAL', 18))
tokens = nltk.word_tokenize(" ".join(cluster_i))

# Create the trigrams and bigrams for the last cluster inspected above
tgs = nltk.trigrams(tokens)
bgs = nltk.bigrams(tokens)

# Compute the frequency distribution of each n-gram order
fdist = nltk.FreqDist(tgs)
trigram = " ".join(fdist.most_common(1)[0][0]), fdist.most_common(1)[0][1]
fdist = nltk.FreqDist(bgs)
bigram = " ".join(fdist.most_common(1)[0][0]), fdist.most_common(1)[0][1]
fdist = nltk.FreqDist(tokens)
onegram = fdist.most_common(1)[0][0],fdist.most_common(1)[0][1]

print(trigram, bigram, onegram)
('X 4 LOCAL', 3) ('LOCAL COQUILLE', 6) ('LOCAL', 18)
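
The section title mentions TF-IDF, while the code above ranks raw n-gram frequencies. As a complementary sketch, cluster keywords could also be extracted with scikit-learn's TfidfVectorizer, treating each cluster as a single document. This is a hypothetical alternative, not part of the pipeline above, and get_feature_names_out requires a recent scikit-learn.

# Hypothetical alternative: TF-IDF keywords per cluster
from sklearn.feature_extraction.text import TfidfVectorizer

cluster_docs = clusters.apply(" ".join)          # one document per cluster
vectorizer = TfidfVectorizer(ngram_range=(1, 3))
tfidf = vectorizer.fit_transform(cluster_docs)
terms = np.array(vectorizer.get_feature_names_out())

for cluster_id, row in zip(cluster_docs.index, tfidf.toarray()):
    top_terms = terms[row.argsort()[::-1][:3]]   # three highest TF-IDF terms
    print(cluster_id, list(top_terms))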