<html><head></head><body><div style="color:#000; background-color:#fff; font-family:HelveticaNeue, Helvetica Neue, Helvetica, Arial, Lucida Grande, sans-serif;font-size:16px"><div id="yui_3_16_0_ym19_1_1481293323206_7598">Hi all,</div><div id="yui_3_16_0_ym19_1_1481293323206_7599"><br></div><div id="yui_3_16_0_ym19_1_1481293323206_7600">My name is Fábio and I'm new to scikit, and I'm trying to cluster information from a file with a Python script (which I found on the web). But I saw that the output has a problem with numbers... See:</div><div id="yui_3_16_0_ym19_1_1481293323206_7600"><br></div><div id="yui_3_16_0_ym19_1_1481293323206_7724">Script#</div><div id="yui_3_16_0_ym19_1_1481293323206_7724"><br></div><div id="yui_3_16_0_ym19_1_1481293323206_7726">import click</div><div id="yui_3_16_0_ym19_1_1481293323206_7727">import re</div><div id="yui_3_16_0_ym19_1_1481293323206_7728">import numpy</div><div id="yui_3_16_0_ym19_1_1481293323206_7729">import random</div><div id="yui_3_16_0_ym19_1_1481293323206_7730"><br id="yui_3_16_0_ym19_1_1481293323206_7731"></div><div id="yui_3_16_0_ym19_1_1481293323206_7732">from collections import defaultdict</div><div id="yui_3_16_0_ym19_1_1481293323206_7733"><br id="yui_3_16_0_ym19_1_1481293323206_7734"></div><div id="yui_3_16_0_ym19_1_1481293323206_7735">from sklearn.feature_extraction.text import TfidfVectorizer</div><div id="yui_3_16_0_ym19_1_1481293323206_7736">from sklearn.cluster import KMeans</div><div id="yui_3_16_0_ym19_1_1481293323206_7737"><br id="yui_3_16_0_ym19_1_1481293323206_7738"></div><div id="yui_3_16_0_ym19_1_1481293323206_7739"><br id="yui_3_16_0_ym19_1_1481293323206_7740"></div><div id="yui_3_16_0_ym19_1_1481293323206_7741">@click.command()</div><div id="yui_3_16_0_ym19_1_1481293323206_7742">@click.argument('filename')</div><div id="yui_3_16_0_ym19_1_1481293323206_7743">@click.option('--clusters', default=50, help='Number of clusters')</div><div id="yui_3_16_0_ym19_1_1481293323206_7744">@click.option('--sample', default=400, 
help='Number of samples to print')</div><div id="yui_3_16_0_ym19_1_1481293323206_7745">def cluster_lines(filename, clusters, sample):</div><div id="yui_3_16_0_ym19_1_1481293323206_7746"> lines = numpy.array(list(_get_lines(filename)))</div><div id="yui_3_16_0_ym19_1_1481293323206_7747"><br id="yui_3_16_0_ym19_1_1481293323206_7748"></div><div id="yui_3_16_0_ym19_1_1481293323206_7749"> doc_feat = TfidfVectorizer().fit_transform(lines)</div><div id="yui_3_16_0_ym19_1_1481293323206_7750"> km = KMeans(clusters).fit(doc_feat)</div><div id="yui_3_16_0_ym19_1_1481293323206_7751"><br id="yui_3_16_0_ym19_1_1481293323206_7752"></div><div id="yui_3_16_0_ym19_1_1481293323206_7753"> k = 0</div><div id="yui_3_16_0_ym19_1_1481293323206_7754"> clusters = defaultdict(list)</div><div id="yui_3_16_0_ym19_1_1481293323206_7755"> for i in km.labels_:</div><div id="yui_3_16_0_ym19_1_1481293323206_7756"> clusters[i].append(lines[k])</div><div id="yui_3_16_0_ym19_1_1481293323206_7757"> k += 1</div><div id="yui_3_16_0_ym19_1_1481293323206_7758"><br id="yui_3_16_0_ym19_1_1481293323206_7759"></div><div id="yui_3_16_0_ym19_1_1481293323206_7760"> s_clusters = sorted(clusters.values(), key=lambda l: -len(l))</div><div id="yui_3_16_0_ym19_1_1481293323206_7761"><br id="yui_3_16_0_ym19_1_1481293323206_7762"></div><div id="yui_3_16_0_ym19_1_1481293323206_7763"> for cluster in s_clusters:</div><div id="yui_3_16_0_ym19_1_1481293323206_7764"> print 'Cluster [%s]:' % len(cluster)</div><div id="yui_3_16_0_ym19_1_1481293323206_7765"> if len(cluster) > sample:</div><div id="yui_3_16_0_ym19_1_1481293323206_7766"> cluster = random.sample(cluster, sample)</div><div id="yui_3_16_0_ym19_1_1481293323206_7767"> for line in cluster:</div><div id="yui_3_16_0_ym19_1_1481293323206_7768"> print line</div><div id="yui_3_16_0_ym19_1_1481293323206_7769"> print '--------'</div><div id="yui_3_16_0_ym19_1_1481293323206_7770"><br id="yui_3_16_0_ym19_1_1481293323206_7771"></div><div 
id="yui_3_16_0_ym19_1_1481293323206_7772"><br id="yui_3_16_0_ym19_1_1481293323206_7773"></div><div id="yui_3_16_0_ym19_1_1481293323206_7774">def _clean_line(line):</div><div id="yui_3_16_0_ym19_1_1481293323206_7775"> line = line.strip().lower()</div><div id="yui_3_16_0_ym19_1_1481293323206_7776"> line = re.sub('\d+', '(N)', line)</div><div id="yui_3_16_0_ym19_1_1481293323206_7777"> return line</div><div id="yui_3_16_0_ym19_1_1481293323206_7778"><br id="yui_3_16_0_ym19_1_1481293323206_7779"></div><div id="yui_3_16_0_ym19_1_1481293323206_7780"><br id="yui_3_16_0_ym19_1_1481293323206_7781"></div><div id="yui_3_16_0_ym19_1_1481293323206_7782">def _get_lines(filename):</div><div id="yui_3_16_0_ym19_1_1481293323206_7783"> for line in open(filename).readlines():</div><div id="yui_3_16_0_ym19_1_1481293323206_7784"> yield _clean_line(line)</div><div id="yui_3_16_0_ym19_1_1481293323206_7785"><br id="yui_3_16_0_ym19_1_1481293323206_7786"></div><div id="yui_3_16_0_ym19_1_1481293323206_7787"><br id="yui_3_16_0_ym19_1_1481293323206_7788"></div><div id="yui_3_16_0_ym19_1_1481293323206_7789">if __name__ == '__main__':</div><div id="yui_3_16_0_ym19_1_1481293323206_7790"> cluster_lines()</div><div id="yui_3_16_0_ym19_1_1481293323206_7790"><br></div><div id="yui_3_16_0_ym19_1_1481293323206_7790">output </div><div id="yui_3_16_0_ym19_1_1481293323206_7893">[root@vmcaiosyscolprod01 71001492]# python Cluster-LearnMachine.py DataSets/ospf.teste3</div><div id="yui_3_16_0_ym19_1_1481293323206_7894">Cluster [7]:</div><div id="yui_3_16_0_ym19_1_1481293323206_7895">"rjbotaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7896">"rjmteab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost 
reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7897">"rjmckaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7898">"rjdqcaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7899">"rjdqcab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7900">"rjcenaa max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7901">"rjcenab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div 
id="yui_3_16_0_ym19_1_1481293323206_7902">--------</div><div id="yui_3_16_0_ym19_1_1481293323206_7903">Cluster [1]:</div><div id="yui_3_16_0_ym19_1_1481293323206_7904">"rjbotab max-metric router-lsa on-startup log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7905">--------</div><div id="yui_3_16_0_ym19_1_1481293323206_7906">Cluster [1]:</div><div id="yui_3_16_0_ym19_1_1481293323206_7907">"rjmteaa ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7908">--------</div><div id="yui_3_16_0_ym19_1_1481293323206_7909">Cluster [1]:</div><div id="yui_3_16_0_ym19_1_1481293323206_7910">"rjmckab max-metric router-lsa on-startup ispf log-adjacency-changes detail auto-cost reference-bandwidth timers throttle spf timers throttle lsa timers lsa arrival timers pacing flood passive-interface default maximum-paths mpls ldp sync mpls traffic-eng router-id loopback mpls traffic-eng area"</div><div id="yui_3_16_0_ym19_1_1481293323206_7911">--------</div><div dir="ltr" id="yui_3_16_0_ym19_1_1481293323206_7912"><br id="yui_3_16_0_ym19_1_1481293323206_7913"></div><div dir="ltr" id="yui_3_16_0_ym19_1_1481293323206_7912"><br></div><div dir="ltr" id="yui_3_16_0_ym19_1_1481293323206_7912">See that the output shows (N) in place of numbers, and I haven't found a way to use the biggest cluster as a template to find the differences between the biggest cluster and the other clusters. 
How can I do that?</div><div dir="ltr" id="yui_3_16_0_ym19_1_1481293323206_7912"><br></div><div dir="ltr" id="yui_3_16_0_ym19_1_1481293323206_7912">Thanks</div><div id="yui_3_16_0_ym19_1_1481293323206_7790"><br></div><div dir="ltr" id="yui_3_16_0_ym19_1_1481293323206_7791"><br id="yui_3_16_0_ym19_1_1481293323206_7792"></div></div></body></html>