029_TweetLanguageClassifier(Scala)

Twitter Streaming Language Classifier

This is a databricksification of https://databricks.gitbooks.io/databricks-spark-reference-applications/content/twitter_classifier/index.html by Amendra Shreshta.

Note that you need to change the fields in background notebooks like 025_a_extendedTwitterUtils2run as explained in the corresponding videos by Amendra.

%run "scalable-data-science/sds-2-2-360-in-525-02/025_a_extendedTwitterUtils2run"
import twitter4j._ import twitter4j.auth.Authorization import twitter4j.conf.ConfigurationBuilder import twitter4j.auth.OAuthAuthorization import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.receiver.Receiver
defined class ExtendedTwitterReceiver
defined class ExtendedTwitterInputDStream
import twitter4j.Status import twitter4j.auth.Authorization import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream} defined object ExtendedTwitterUtils
done running the extendedTwitterUtils2run notebook - ready to stream from twitter
import org.apache.spark._
import org.apache.spark.storage._
import org.apache.spark.streaming._

import scala.math.Ordering

import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder
import org.apache.spark._ import org.apache.spark.storage._ import org.apache.spark.streaming._ import scala.math.Ordering import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder
import twitter4j.auth.OAuthAuthorization
import twitter4j.conf.ConfigurationBuilder

// Twitter API credentials: replace the empty strings with your own values before running.
def MyconsumerKey    = ""
def MyconsumerSecret = ""
def Mytoken          = ""
def MytokenSecret    = ""

// Publish each credential as a JVM system property under its twitter4j.oauth.* key.
Seq(
  "twitter4j.oauth.consumerKey"       -> MyconsumerKey,
  "twitter4j.oauth.consumerSecret"    -> MyconsumerSecret,
  "twitter4j.oauth.accessToken"       -> Mytoken,
  "twitter4j.oauth.accessTokenSecret" -> MytokenSecret
).foreach { case (propertyKey, propertyValue) =>
  System.setProperty(propertyKey, propertyValue)
}
import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder MyconsumerKey: String MyconsumerSecret: String Mytoken: String MytokenSecret: String res1: String = null
// Downloading tweets and building a model for clustering
// ## Let's create a directory in dbfs for storing tweets in the cluster's distributed file system.
// Every streamed batch is written somewhere below this root directory.
val outputDirectoryRoot = "/datasets/tweetsStreamTmp" // output directory
outputDirectoryRoot: String = /datasets/tweetsStreamTmp
// WARNING: this cell recursively deletes any pre-existing output directory so that collection starts from scratch.
// Comment out the next line if you want to keep previously collected tweets.
dbutils.fs.rm(outputDirectoryRoot, true) 
res2: Boolean = false
// ## Capture tweets in every interval of slideInterval length (used below as the streaming context's batch duration).
val slideInterval = Seconds(1) // 1 second = 1000 milliseconds
slideInterval: org.apache.spark.streaming.Duration = 1000 ms
// Our goal is to take each RDD in the twitter DStream and write it to dbfs
// (as parquet files holding one JSON string per tweet, see the foreachRDD cell below).
// Create a Spark Streaming Context on the existing SparkContext `sc`, batching by slideInterval.
val ssc = new StreamingContext(sc, slideInterval)
ssc: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@3e2a07fd
// Create a Twitter Stream for the input source. 
// The authorization is built from a default twitter4j configuration
// (presumably picking up the twitter4j.oauth.* system properties set earlier; verify against twitter4j docs).
val auth = Some(new OAuthAuthorization(new ConfigurationBuilder().build()))
// ExtendedTwitterUtils comes from the 025_a_extendedTwitterUtils2run notebook run at the top.
val twitterStream = ExtendedTwitterUtils.createStream(ssc, auth)
auth: Some[twitter4j.auth.OAuthAuthorization] = Some(OAuthAuthorization{consumerKey='fB9Ww8Z4TIauPWKNPL6IN7xqd', consumerSecret='******************************************', oauthToken=AccessToken{screenName='null', userId=28513570}}) twitterStream: org.apache.spark.streaming.dstream.ReceiverInputDStream[twitter4j.Status] = ExtendedTwitterInputDStream@2acbe39c
// Let's import google's json library next.
import com.google.gson.Gson
// Map the tweets into JSON-formatted strings (one tweet per element).
// One Gson instance is created per partition instead of one per tweet, so Gson's
// per-instance type-adapter cache is reused across all tweets of the partition.
// The instance is built inside the closure, so it never has to be shipped from the driver.
val twitterStreamJson = twitterStream.mapPartitions { statuses =>
  val gson = new Gson()
  statuses.map(status => gson.toJson(status))
}
import com.google.gson.Gson twitterStreamJson: org.apache.spark.streaming.dstream.DStream[String] = org.apache.spark.streaming.dstream.MappedDStream@616fc13d
val partitionsEachInterval = 1 // number of partitions each batch is repartitioned to before writing

val batchInterval = 1 // in minutes
val timeoutJobLength =  batchInterval * 5

var newContextCreated = false
var numTweetsCollected = 0L // track number of tweets collected

// For every batch RDD of JSON strings: skip empty batches, otherwise append the batch as parquet
// under <outputDirectoryRoot>/<yyyy>/<MM>/<dd>/<HH>/<batch time in ms> and update the running count.
twitterStreamJson.foreachRDD((rdd, time) => {
      val count = rdd.count()
      if (count > 0) {
        // single-column DataFrame holding one JSON string per tweet
        val outputDF = rdd.repartition(partitionsEachInterval).toDF("tweetAsJsonString")
        // Format ONE wall-clock instant once. Calling `new Date()` separately for year, month,
        // day and hour could straddle a boundary and yield an inconsistent directory
        // (e.g. the old day combined with the new hour).
        val hourPath = new java.text.SimpleDateFormat("yyyy/MM/dd/HH").format(new java.util.Date())
        // write to a clear time-based hierarchical directory structure, one leaf directory per batch time
        outputDF.write.mode(SaveMode.Append)
                .parquet(s"$outputDirectoryRoot/$hourPath/${time.milliseconds}")
        numTweetsCollected += count // update with the latest count
      }
  })
partitionsEachInterval: Int = 1 batchInterval: Int = 1 timeoutJobLength: Int = 5 newContextCreated: Boolean = false numTweetsCollected: Long = 0
// ## Let's start the spark streaming context we have created next.
ssc.start()
// Total tweets downloaded so far; this counter is incremented by the foreachRDD action defined above.
numTweetsCollected
res13: Long = 1836
// ## Go to SparkUI and see if a streaming job is already running. If so you need to terminate it before starting a new streaming job. Only one streaming job can be run on the DB CE.
// # Let's stop the streaming job next, keeping the underlying SparkContext alive.
ssc.stop(stopSparkContext = false) 
// Also stop whatever streaming context is still reported as active (again without stopping the SparkContext).
StreamingContext.getActive.foreach { _.stop(stopSparkContext = false) } 
// # Let's examine what was saved in dbfs
display(dbutils.fs.ls(outputDirectoryRoot))
dbfs:/datasets/tweetsStreamTmp/2017/2017/0
// Replace the year/month below with the period in which you collected tweets.
// The pattern is /<year>/<month>/<any day>; the trailing "/*/*" matches the hour and
// batch-time directories written by the streaming job.
val date = "/2017/11/*"
val rawDF = fromParquetFile2DF(s"$outputDirectoryRoot$date/*/*") //.cache()
// Parse the JSON strings into tweets, convert them to the TTT table, and cache the result.
val TTTsDF = tweetsDF2TTTDF(tweetsJsonStringDF2TweetsDF(rawDF)).cache()
date: String = /2017/11/* rawDF: org.apache.spark.sql.DataFrame = [tweetAsJsonString: string] TTTsDF: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [CurrentTweetDate: timestamp, CurrentTwID: bigint ... 33 more fields]
// Register the tweets DataFrame as the temporary view `tbl_tweet` so it can be queried with SQL.
TTTsDF.createOrReplaceTempView("tbl_tweet")
// Print ten rows of (language, user name, tweet text) as a quick look at the data.
sqlContext
  .sql("SELECT lang, CPostUserName, CurrentTweet FROM tbl_tweet LIMIT 10")
  .collect()
  .foreach(row => println(row))
[ja,⚫️🌏⚪️NAT💲UKI⚪️🌏⚫️,RT @she_is_lie: https://t.co/aGTKqpjHva] [en,☆Tesia☆D-1 WSD📌,Not that it matters but 38 minutes until I turn 18] [ja,ミナモン🦄🌙,@clubj_ 値段もそれなりだしね💦] [en,Pratik Raj IN,@ZeeNewsHindi Is apna muh bhi kala karwana chahiye Tha agar asal ki virodhi Hai to @MamataOfficial] [ja,なお,もういや。] [it,.,RT @baciamicoglione: m i p i a c e l a f i g a] [en,Mehboob,RT @raheelrana: میاں صاحب تسی وی شریف تواڈے وکیل وی شریف خدا دا واسطہ جے آپ اپنے آپ کو مت بدلو مگر اپنے وکیل کو بدل لو جس طرح… ] [en,きろゼロ🇸🇬,RT @jlist: When things are going really bad at work. https://t.co/0cqPLeKcPX] [ja,なべどん,AFA in 台湾✩ ずっと描いてた初めての海外ランウェイ。 イベントに申込んだ時からワクワクが止まらない。 皆さんの1pt1ptを、力を貸してください。 https://t.co/GcrQYqJ1MP #えんな https://t.co/XsCIFqxWbQ] [ja,ソト(8割空元気。),RT @Kono0425_ry: 愛知県の方気おつけて下さい。 車に傷つける(一本線)の被害が立て続けに起きてます。 自分の近所は安全だからと安心せずに保険に入ったりドラレコつけたりする事をオススメします。 見積もりだすと何十万、何百万です。… ]
// Checking the language of tweets: number of tweets per `lang`, most frequent first (at most 1000 rows).
sqlContext
  .sql("SELECT lang, COUNT(*) as cnt FROM tbl_tweet GROUP BY lang ORDER BY cnt DESC limit 1000")
  .collect()
  .foreach(row => println(row))
[en,626] [ja,513] [ko,142] [ar,107] [es,94] [pt,72] [th,67] [fr,49] [tr,38] [ru,31] [it,18] [ca,17] [id,16] [en-gb,13] [de,13] [zh-cn,10] [nl,8] [zh-CN,3] [fi,3] [sr,2] [hu,2] [el,2] [zh-TW,2] [en-GB,2] [pl,2] [vi,2] [zh-tw,1] [ro,1] [hr,1] [uk,1] [bg,1] [en-AU,1] [zh-Hant,1] [hi,1] [da,1]
// Extract just the tweet text from the table as a Dataset[String].
// The column value itself is used rather than `Row.toString`, which wraps the text in
// square brackets ("[...]"); those brackets would leak into the character bigrams used as
// features and would not match raw strings passed to `featurize` at prediction time.
// A null tweet is mapped to the empty string.
val texts = sqlContext
      .sql("SELECT CurrentTweet from tbl_tweet")
      .map(row => Option(row.get(0)).fold("")(_.toString))
texts: org.apache.spark.sql.Dataset[String] = [value: string]
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Vector, Vectors}
/** Turns a piece of text into a sparse feature vector of length 1000.
  *
  * The text is cut into overlapping character bigrams (an n-gram model with n = 2); each bigram
  * is hashed into one of 1000 buckets and adds `1 / (number of bigrams)` to that bucket.
  * Only the non-zero buckets are stored in the returned MLlib vector.
  *
  * @param s the text to featurize
  * @return a sparse vector of size 1000 that can be passed to MLlib
  */
def featurize(s: String): Vector = {
  val numBuckets = 1000
  val grams = s.sliding(2).toArray
  // every bigram contributes the same weight, so compute it once
  val weight = 1.0 / grams.length
  val buckets = Array.fill(numBuckets)(0.0)
  grams.foreach { gram =>
    buckets(gram.hashCode % numBuckets) += weight
  }
  // keep only (index, value) pairs of buckets that received at least one bigram
  val nonZero = buckets.zipWithIndex.collect {
    case (value, index) if value != 0 => (index, value)
  }
  Vectors.sparse(numBuckets, nonZero)
}
featurize: (s: String)org.apache.spark.mllib.linalg.Vector
// Featurize every tweet text and cache the resulting vectors RDD,
// since it will be used for all the KMeans iterations.
val vectors = texts.rdd
      .map(featurize)
      .cache()
vectors: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.Vector] = MapPartitionsRDD[787] at map at command-2771931608832193:3
// cache() is lazy, so running count() forces the vectors to be computed and stored in memory
vectors.count()
res21: Long = 1863
// Inspect the first feature vector: (size, [non-zero indices], [values]).
vectors.first()
res22: org.apache.spark.mllib.linalg.Vector = (1000,[50,53,56,78,96,99,100,180,189,226,285,325,340,350,356,358,370,438,453,488,504,525,554,573,578,587,615,623,626,636,642,660,669,679,708,712,755,830,845,903],[0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025,0.025])
// Train a k-means model with 10 clusters and at most 10 iterations
val model = KMeans.train(vectors, k=10, maxIterations = 10)
model: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@616925c8
// Take the first 100 tweets of the original set (take(100) is not a random sample)
val some_tweets = texts.take(100)
some_tweets: Array[String] = Array([RT @she_is_lie: https://t.co/aGTKqpjHva], [Not that it matters but 38 minutes until I turn 18], [@clubj_ 値段もそれなりだしね💦], [@ZeeNewsHindi Is apna muh bhi kala karwana chahiye Tha agar asal ki virodhi Hai to @MamataOfficial], [もういや。], [RT @baciamicoglione: m i p i a c e l a f i g a], [RT @raheelrana: میاں صاحب تسی وی شریف تواڈے وکیل وی شریف خدا دا واسطہ جے آپ اپنے آپ کو مت بدلو مگر اپنے وکیل کو بدل لو جس طرح… ], [RT @jlist: When things are going really bad at work. https://t.co/0cqPLeKcPX], [AFA in 台湾✩ ずっと描いてた初めての海外ランウェイ。 イベントに申込んだ時からワクワクが止まらない。 皆さんの1pt1ptを、力を貸してください。 https://t.co/GcrQYqJ1MP #えんな https://t.co/XsCIFqxWbQ], [RT @Kono0425_ry: 愛知県の方気おつけて下さい。 車に傷つける(一本線)の被害が立て続けに起きてます。 自分の近所は安全だからと安心せずに保険に入ったりドラレコつけたりする事をオススメします。 見積もりだすと何十万、何百万です。… ], [カエル寄りのナメクジです 難解な腐女子 ~生命の数だけ性癖はある~ | かおもじ #pixivコミック https://t.co/UJOQWDqp58], [RT @whatgirIsIove: no offence to me but wtf am i doing], [RT @yuyu_d: #無言で過去絵をあげる見た人もやる https://t.co/UFiaVVfHcj], [(فلما جاء أمرنا جعلنا عاليها سافلها وأمطرنا عليها حجارة من سجيل منضود) [هود:82] https://t.co/HTLfiMcgb3], [ربي أعوذ بك من الكسل وسوء الكبر https://t.co/jCbc2qxOlI], [RT @bellyinsmile: กล่อมน้องนอน #ชูใจ https://t.co/XmIecEtLLh], [RT @chortletown: 💵Pledge 4my life 😍ADOPT or FOSTER me ❤️me pls #A299815 https://t.co/IBTU2T7EkE #memphis via https://t.co/P7SzHSaTOA https…], [RT @yu_mene: ทีมงานบอกให้รอแบบสบายๆ 😂 https://t.co/bueHSWEqlc], [RT @fukumenkeimovie: \14日と15日の夜はLINE LIVE/ 志尊さん&小関さんの「#さしめし」😋❤️ 📱視聴予約をお忘れなく🎸キラキラ #覆面系ノイズ https://t.co/No46H6mOgq], [RT @Nopinochos: @joluga68 @CiudadanosCs @policia @guardiacivil @HoraJaen @jusapol #equiparacionya], [I'm at 名師橋 in Nagoya-shi, 愛知県 https://t.co/JAvvHX85nt], [RT @KylesHotTakes: And Doc Halladay grounds out to end the inning], [تقولي ما تستحين على وجهج مصبغة أظافر رجولج شنو يعني اغراء ؟! اذبحها], [bla bla bla https://t.co/1VmXZk9rRH], [@Aghaye_Biikhat اره دیدم😃], [@jalalaeddine @S_ALKarashi سبحان الله تدافع عن الصوفية وحزب البعث الإشتراكي... 
وتريد منا أن نقبل كلامك؟], [@Kiekkokirja Livetuloksista. En oo 100% varma onko tuo totta kun siellä on joskus väärää tietoa esim. kokoonpanoissa], [ガチマでは ホコが勝てない(T^T) リグマだとフレンドさんが 心強くて勝てますが(っ´ω`c)], [@carol19761112 お頼みして描いて頂きました☺️可愛いですよね✨], [@kero_hugu 😂残念! あなたには特別クーポンをさしあげます❤ 特別クーポンで、KFC秘伝の香りを楽しんでね🍗 明日もキャンペーンに参加して「チキンの香り?の入浴剤」をゲットしよう👍 #KFC https://t.co/1ESHjKp7di https://t.co/RPqtuDwyNE], [tesudo que só https://t.co/66yaIRfI6W], [RT @ciilat_gko: 訂正と補足。免疫が「発達した」は「低下した」の間違いですごめんなさい! 詳しく言うと、狩猟時代に男性のケガや病気が増加→免疫が過剰反応し、自己免疫異常が発生→進化の過程で男性の免疫システムが非活動的に変化した…という流れです。うろ覚えで文がおかし…], [RT @ktrsngofficial: フォロワー100万人突破!皆さまのパワーでありがとうございます!のんびり楽しんでイキましょう! インスタグラマー香取慎吾! 本格始動です! ハロー #ホンネロス https://t.co/PDPINoUQPI], [My bunkmates now dw give me a new stick after i break it in half 😂😂😂😂], [RT @oz_il11: タオル全メンバー全完売は草 https://t.co/IgqiUtOPMS], [RT @Jawara_B411: Menyikapi kabar pembubaran Dakwah yang ramaikan diperbincangkan belakangan ini. Kami dari DPP BRIGADE JAWARA BETAWI… ], [RT @euneun13st: แต่งใหญ่สุด หมายถึงชุดใหญ่สุด เข้าประตูมายังไงนิ https://t.co/ct3krCx0Hr], [RT @rolctSsnyBfVNAi: 무닌😭 https://t.co/GNceC6XwVd], [RT @serikon_mha: ヒーローは神ではない https://t.co/SLmxMMk6v2], [@_A_n_d_r_e_w_s @BoobPunchTina Rut-Roh. Set much of my first novel in #Newfoundland I may be in deep trouble, puffin-wise... 
#amwriting], [@gxgarea Temenin aku juga....], [@nyanmarubl にゃんちゃんたらwwwそうそう、お願い決まった?], [@geahcintun เป็นอยู่ทุกวัน..], [RT @Tumyad4: Sabah 6 da büyük kızı okula bırakıyor öglene kadar kagıt topluyor heryer arap, afgan ,pakistanlı dolu abi inşaata g… ], [(햇는데 아버지랑 떠야하는건아니겟지;)], [RT @thebtsmutuals: rt if u love min yoongi , follow whoever rts 🍁], [RT @DepreFogo: Hoje Tem Jogo Do Botafogo O Glorioso É O Meu Grande Amor Te Amo Fogo 🔥], [RT @Sanchovies: was dming this girl but I had to block her lol just seen she had "Matthew 6:8" in her bio no way I can compete with a fella…], [miga sai dessa que é furada, confia na tia], [@feiler_wt @marum0t1 ダメです https://t.co/biA9YW0Jof], [@ron_im_iz Will do :)], [RT #MPN #BadLiar #SelenaGomezInstagram https://t.co/zoj3xMZqrI], [RT @baejinyoung_00: 진영이 압구정역 광고 너무 예뻐💧 #배진영 #BAEJINYOUNG https://t.co/5iA3Wp8Eux], [ポチれよ], [RT @sapinker: Augmented Reality Glasses for Autism | Innovative new tech for kids, aids emotions, self-regulation, more. Indiegogo https://…], [Balkanci u Nemačkoj za pet godina zarade novac za koji u svojoj zemlji rade ceo život https://t.co/c6vhcoa2zu], [alguien me mata para no ir al colegio], [RT @Story_terror: はいかわいい #cingeki https://t.co/gZFG9I9FmM], [古戦場はゆるゆるやる。 できればガチャ回しまくりたい], [RT @proofhealth: Proof #blockchain and Health Minister of Ontario, Dr. Eric Hoskins discuss #smartcontracts #privacy #security… ], [RT @GLove39: Siri, show me a visual metaphor for tax avoidance #ParadisePapers https://t.co/wVdSy7QtMZ], [RT @Fnac: JEU CONCOURS: À l'occasion du #SalonPhotoParis qui débute demain, tente de gagner le @FujiInstaxFR mini 9 bleu givr… ], [https://t.co/0VPlrWxm0a], [RT @twittakarai: 今月発売のDear+でテンカウントが最終話となります(•ㅿ•.)(ㅍ_ㅍ) 4年強の間、応援のリプライやお手紙ややさしいイラストなどに心が救われる思いでした。本当にありがとうございました! 
最終巻の6巻は春ごろ発売になります。ど… ], [@diary_ER1N_ 갠차노], [.明月不谙离恨苦,斜光到晓穿朱户.-晏殊《鹊踏枝》], [RT @_Tazmxni: Mon pote il m'accompagne au magasin et pour l'remercier j'lui dis prend un bail, ce fou il avait l'intention d'acheter une co…], [RT @taekookmoments: can't believe taekook checked what letter each other got to see which team they're in wHAT 👀👀 https://t.co/dsNi9QLzJS], [RT @famichikisenpai: #どん兵衛こわい 変なネタいっぱい送ってきた…皆さん早くもらってください → フォロー&RTで合計3万名様にどん兵衛プレゼント。2日目は9日11:59まで #ファミどん なんて知らない https://t.co/Mmvr5BeIzV h…], [jwu], [씻고오겟슴다], [RT @ksngtysofficial: ###ご機嫌斜め!@## #ユーチューバー草彅 #ホンネテレビ https://t.co/ySvYTr4z52], [RT @mybraceteeth: คนไข้ผมวันนี้ 27ปี ฟันหน้ากร่อนเป็นหยักๆๆ ซักประวัติพบว่า ชอบดื่มน้ำอัดลมมาตั้งแต่เด็ก ดื่มแล้วจะจิบๆบริเวณฟันหน้า… ], [Bluetoothイヤホンなくした、、、こいつぁショックでかめ_(-ω-`_)⌒)_], [Sendo fofas na Disneyland!! 🎢🏰😍Apenas as melhores!! Já estou com saudades!! 😭😭😩 #disneyland… https://t.co/avY5bVcSmW], [RT @Diamond_97jk: 171104 WINGS Macau #방타소년단 #BTS #정국 #JUNGKOOK https://t.co/qZFQOYGA09], [2 MCs are so done 😂their face lol], [RT @propositey: sou adolescente e odeio adolescente na adolescência], [RT @vthomasm: Corrupción. La caja B del PP. El policía que destapó Gürtel acusa al PP de haber intentado “desestabilizar la investigación”…], [RT @CrushOn2226: 20171107 #MONSTA_X #몬스타엑스 #아이엠 #IM #기현 #KIHYUN #DRAMARAMA #드라마라마 @OfficialMonstaX SHOW CON DRAMARAMA 기현 focus ful… ], [RT @rcabrero75: Y Rivera y Pedro Sánchez ande andarán.. 
Ensordecedor silencio ante la gravedad de lo que ha ocurrido hoy en el Con… ], [@henriquemenes @oretalha ps4 mo ruim], [好み似てるところある], [@hayashida_sannn 元気もらえる😄], [(وإن الذين لا يؤمنون بالآخرة عن الصراط لناكبون) [المؤمنون:74] https://t.co/fRdUMQnNOD], [RT @PostMonstaX: [INFO] Hoje, às 8 da manhã (Horário de Brasília) vai ao ar o ShowChampion, sendo este o primeiro stage de… ], [Check out what I'm selling on my @depop shop 💸 https://t.co/Rkr3CFf14D], [RT @saku93: doaxシリーズ女天狗 #いいおっぱいの日 https://t.co/EMsvtdnWh3], [Vou volta a dormir muito tô cheio de sono], [RT @JosPastr: Por fin los Mossos hacen algo: proteger a los piquetes ante quien se atreva a encararse con ellos. https://t.co/tc5BLBCwKu], [RT @RaiLetteratura: 170 anni fa nasceva Bram #Stoker autore di #Dracula il celeberrimo romanzo #gotico, ne parla Alessandro Zaccuri… ], [RT @TH3WH17ERABB17: What must occur to allow for civilian trials? #followthewhiterabbit 🐇 https://t.co/MlGOGwp0e9], [@kr1h_ アーラシュくんが57で止まってる悲しい現実], [@valerie_expert @SophiePendevill @ARTEfr Julie Peyrard "L'accord pour le namming du musée cours pour 30 ans et a coûté 1 milliard tout accords compris." #louvreabudhabi], [パクりました ドーンガード暁の護衛説 https://t.co/2hX7gDq3Xc], [デザイン気になるね😌 https://t.co/Oz1WbFpgYb], [#Criciuma #PracaChamine #Centenario #IoT #SSP #Comunidade https://t.co/uCVYeB7aZc], [[MV] 어반자카파 - 그때의 나, 그때의 우리 https://t.co/xL6snkfhho https://t.co/WfA8UslpS0], [나 빱빠YA ㅅㅏ랑함 https://t.co/SwnvJhU9ff], [JKの新教祖?イタイ歌詞が人気の歌手「阿部真央」])
// Iterate through the sampled tweets and show which cluster each one is in.
// Every tweet is featurized and predicted exactly once and then grouped by cluster id,
// instead of being re-predicted once per cluster. The number of clusters comes from the
// model itself, so this cell stays correct if `k` is changed at training time.
// The braces keep the helper value local to this cell.
{
  val tweetsByCluster = some_tweets.groupBy(tweet => model.predict(featurize(tweet)))
  (0 until model.k).foreach { i =>
    println(s"\nCLUSTER $i:")
    // clusters that received none of the sampled tweets print only their header
    tweetsByCluster.getOrElse(i, Array.empty[String]).foreach(println)
  }
}
CLUSTER 0: [RT @she_is_lie: https://t.co/aGTKqpjHva] [RT @jlist: When things are going really bad at work. https://t.co/0cqPLeKcPX] [AFA in 台湾✩ ずっと描いてた初めての海外ランウェイ。 イベントに申込んだ時からワクワクが止まらない。 皆さんの1pt1ptを、力を貸してください。 https://t.co/GcrQYqJ1MP #えんな https://t.co/XsCIFqxWbQ] [カエル寄りのナメクジです 難解な腐女子 ~生命の数だけ性癖はある~ | かおもじ #pixivコミック https://t.co/UJOQWDqp58] [RT @yuyu_d: #無言で過去絵をあげる見た人もやる https://t.co/UFiaVVfHcj] [(فلما جاء أمرنا جعلنا عاليها سافلها وأمطرنا عليها حجارة من سجيل منضود) [هود:82] https://t.co/HTLfiMcgb3] [ربي أعوذ بك من الكسل وسوء الكبر https://t.co/jCbc2qxOlI] [RT @bellyinsmile: กล่อมน้องนอน #ชูใจ https://t.co/XmIecEtLLh] [RT @chortletown: 💵Pledge 4my life 😍ADOPT or FOSTER me ❤️me pls #A299815 https://t.co/IBTU2T7EkE #memphis via https://t.co/P7SzHSaTOA https…] [RT @yu_mene: ทีมงานบอกให้รอแบบสบายๆ 😂 https://t.co/bueHSWEqlc] [I'm at 名師橋 in Nagoya-shi, 愛知県 https://t.co/JAvvHX85nt] [bla bla bla https://t.co/1VmXZk9rRH] [@kero_hugu 😂残念! あなたには特別クーポンをさしあげます❤ 特別クーポンで、KFC秘伝の香りを楽しんでね🍗 明日もキャンペーンに参加して「チキンの香り?の入浴剤」をゲットしよう👍 #KFC https://t.co/1ESHjKp7di https://t.co/RPqtuDwyNE] [tesudo que só https://t.co/66yaIRfI6W] [RT @oz_il11: タオル全メンバー全完売は草 https://t.co/IgqiUtOPMS] [RT @euneun13st: แต่งใหญ่สุด หมายถึงชุดใหญ่สุด เข้าประตูมายังไงนิ https://t.co/ct3krCx0Hr] [RT @rolctSsnyBfVNAi: 무닌😭 https://t.co/GNceC6XwVd] [RT @serikon_mha: ヒーローは神ではない https://t.co/SLmxMMk6v2] [@feiler_wt @marum0t1 ダメです https://t.co/biA9YW0Jof] [RT #MPN #BadLiar #SelenaGomezInstagram https://t.co/zoj3xMZqrI] [RT @baejinyoung_00: 진영이 압구정역 광고 너무 예뻐💧 #배진영 #BAEJINYOUNG https://t.co/5iA3Wp8Eux] [RT @Story_terror: はいかわいい #cingeki https://t.co/gZFG9I9FmM] [https://t.co/0VPlrWxm0a] [RT @ksngtysofficial: ###ご機嫌斜め!@## #ユーチューバー草彅 #ホンネテレビ https://t.co/ySvYTr4z52] [RT @Diamond_97jk: 171104 WINGS Macau #방타소년단 #BTS #정국 #JUNGKOOK https://t.co/qZFQOYGA09] [(وإن الذين لا يؤمنون بالآخرة عن الصراط لناكبون) [المؤمنون:74] https://t.co/fRdUMQnNOD] [Check out what I'm selling on my @depop shop 💸 https://t.co/Rkr3CFf14D] [RT @saku93: 
doaxシリーズ女天狗 #いいおっぱいの日 https://t.co/EMsvtdnWh3] [パクりました ドーンガード暁の護衛説 https://t.co/2hX7gDq3Xc] [デザイン気になるね😌 https://t.co/Oz1WbFpgYb] [#Criciuma #PracaChamine #Centenario #IoT #SSP #Comunidade https://t.co/uCVYeB7aZc] [[MV] 어반자카파 - 그때의 나, 그때의 우리 https://t.co/xL6snkfhho https://t.co/WfA8UslpS0] [나 빱빠YA ㅅㅏ랑함 https://t.co/SwnvJhU9ff] CLUSTER 1: [RT @baciamicoglione: m i p i a c e l a f i g a] [RT @raheelrana: میاں صاحب تسی وی شریف تواڈے وکیل وی شریف خدا دا واسطہ جے آپ اپنے آپ کو مت بدلو مگر اپنے وکیل کو بدل لو جس طرح… ] [RT @Kono0425_ry: 愛知県の方気おつけて下さい。 車に傷つける(一本線)の被害が立て続けに起きてます。 自分の近所は安全だからと安心せずに保険に入ったりドラレコつけたりする事をオススメします。 見積もりだすと何十万、何百万です。… ] [RT @fukumenkeimovie: \14日と15日の夜はLINE LIVE/ 志尊さん&小関さんの「#さしめし」😋❤️ 📱視聴予約をお忘れなく🎸キラキラ #覆面系ノイズ https://t.co/No46H6mOgq] [تقولي ما تستحين على وجهج مصبغة أظافر رجولج شنو يعني اغراء ؟! اذبحها] [@jalalaeddine @S_ALKarashi سبحان الله تدافع عن الصوفية وحزب البعث الإشتراكي... وتريد منا أن نقبل كلامك؟] [ガチマでは ホコが勝てない(T^T) リグマだとフレンドさんが 心強くて勝てますが(っ´ω`c)] [@carol19761112 お頼みして描いて頂きました☺️可愛いですよね✨] [RT @ciilat_gko: 訂正と補足。免疫が「発達した」は「低下した」の間違いですごめんなさい! 詳しく言うと、狩猟時代に男性のケガや病気が増加→免疫が過剰反応し、自己免疫異常が発生→進化の過程で男性の免疫システムが非活動的に変化した…という流れです。うろ覚えで文がおかし…] [RT @ktrsngofficial: フォロワー100万人突破!皆さまのパワーでありがとうございます!のんびり楽しんでイキましょう! インスタグラマー香取慎吾! 本格始動です! ハロー #ホンネロス https://t.co/PDPINoUQPI] [@geahcintun เป็นอยู่ทุกวัน..] [(햇는데 아버지랑 떠야하는건아니겟지;)] [RT @DepreFogo: Hoje Tem Jogo Do Botafogo O Glorioso É O Meu Grande Amor Te Amo Fogo 🔥] [古戦場はゆるゆるやる。 できればガチャ回しまくりたい] [RT @twittakarai: 今月発売のDear+でテンカウントが最終話となります(•ㅿ•.)(ㅍ_ㅍ) 4年強の間、応援のリプライやお手紙ややさしいイラストなどに心が救われる思いでした。本当にありがとうございました! 
最終巻の6巻は春ごろ発売になります。ど… ] [@diary_ER1N_ 갠차노] [.明月不谙离恨苦,斜光到晓穿朱户.-晏殊《鹊踏枝》] [RT @famichikisenpai: #どん兵衛こわい 変なネタいっぱい送ってきた…皆さん早くもらってください → フォロー&RTで合計3万名様にどん兵衛プレゼント。2日目は9日11:59まで #ファミどん なんて知らない https://t.co/Mmvr5BeIzV h…] [씻고오겟슴다] [RT @mybraceteeth: คนไข้ผมวันนี้ 27ปี ฟันหน้ากร่อนเป็นหยักๆๆ ซักประวัติพบว่า ชอบดื่มน้ำอัดลมมาตั้งแต่เด็ก ดื่มแล้วจะจิบๆบริเวณฟันหน้า… ] [Bluetoothイヤホンなくした、、、こいつぁショックでかめ_(-ω-`_)⌒)_] [RT @CrushOn2226: 20171107 #MONSTA_X #몬스타엑스 #아이엠 #IM #기현 #KIHYUN #DRAMARAMA #드라마라마 @OfficialMonstaX SHOW CON DRAMARAMA 기현 focus ful… ] [JKの新教祖?イタイ歌詞が人気の歌手「阿部真央」] CLUSTER 2: [もういや。] CLUSTER 3: CLUSTER 4: CLUSTER 5: [jwu] CLUSTER 6: [Not that it matters but 38 minutes until I turn 18] [@ZeeNewsHindi Is apna muh bhi kala karwana chahiye Tha agar asal ki virodhi Hai to @MamataOfficial] [RT @whatgirIsIove: no offence to me but wtf am i doing] [RT @Nopinochos: @joluga68 @CiudadanosCs @policia @guardiacivil @HoraJaen @jusapol #equiparacionya] [RT @KylesHotTakes: And Doc Halladay grounds out to end the inning] [@Kiekkokirja Livetuloksista. En oo 100% varma onko tuo totta kun siellä on joskus väärää tietoa esim. kokoonpanoissa] [My bunkmates now dw give me a new stick after i break it in half 😂😂😂😂] [RT @Jawara_B411: Menyikapi kabar pembubaran Dakwah yang ramaikan diperbincangkan belakangan ini. Kami dari DPP BRIGADE JAWARA BETAWI… ] [@_A_n_d_r_e_w_s @BoobPunchTina Rut-Roh. Set much of my first novel in #Newfoundland I may be in deep trouble, puffin-wise... #amwriting] [@gxgarea Temenin aku juga....] 
[RT @Tumyad4: Sabah 6 da büyük kızı okula bırakıyor öglene kadar kagıt topluyor heryer arap, afgan ,pakistanlı dolu abi inşaata g… ] [RT @thebtsmutuals: rt if u love min yoongi , follow whoever rts 🍁] [RT @Sanchovies: was dming this girl but I had to block her lol just seen she had "Matthew 6:8" in her bio no way I can compete with a fella…] [miga sai dessa que é furada, confia na tia] [RT @sapinker: Augmented Reality Glasses for Autism | Innovative new tech for kids, aids emotions, self-regulation, more. Indiegogo https://…] [Balkanci u Nemačkoj za pet godina zarade novac za koji u svojoj zemlji rade ceo život https://t.co/c6vhcoa2zu] [alguien me mata para no ir al colegio] [RT @proofhealth: Proof #blockchain and Health Minister of Ontario, Dr. Eric Hoskins discuss #smartcontracts #privacy #security… ] [RT @GLove39: Siri, show me a visual metaphor for tax avoidance #ParadisePapers https://t.co/wVdSy7QtMZ] [RT @Fnac: JEU CONCOURS: À l'occasion du #SalonPhotoParis qui débute demain, tente de gagner le @FujiInstaxFR mini 9 bleu givr… ] [RT @_Tazmxni: Mon pote il m'accompagne au magasin et pour l'remercier j'lui dis prend un bail, ce fou il avait l'intention d'acheter une co…] [RT @taekookmoments: can't believe taekook checked what letter each other got to see which team they're in wHAT 👀👀 https://t.co/dsNi9QLzJS] [Sendo fofas na Disneyland!! 🎢🏰😍Apenas as melhores!! Já estou com saudades!! 😭😭😩 #disneyland… https://t.co/avY5bVcSmW] [2 MCs are so done 😂their face lol] [RT @propositey: sou adolescente e odeio adolescente na adolescência] [RT @vthomasm: Corrupción. La caja B del PP. El policía que destapó Gürtel acusa al PP de haber intentado “desestabilizar la investigación”…] [RT @rcabrero75: Y Rivera y Pedro Sánchez ande andarán.. 
Ensordecedor silencio ante la gravedad de lo que ha ocurrido hoy en el Con… ] [@henriquemenes @oretalha ps4 mo ruim] [RT @PostMonstaX: [INFO] Hoje, às 8 da manhã (Horário de Brasília) vai ao ar o ShowChampion, sendo este o primeiro stage de… ] [Vou volta a dormir muito tô cheio de sono] [RT @JosPastr: Por fin los Mossos hacen algo: proteger a los piquetes ante quien se atreva a encararse con ellos. https://t.co/tc5BLBCwKu] [RT @RaiLetteratura: 170 anni fa nasceva Bram #Stoker autore di #Dracula il celeberrimo romanzo #gotico, ne parla Alessandro Zaccuri… ] [RT @TH3WH17ERABB17: What must occur to allow for civilian trials? #followthewhiterabbit 🐇 https://t.co/MlGOGwp0e9] [@valerie_expert @SophiePendevill @ARTEfr Julie Peyrard "L'accord pour le namming du musée cours pour 30 ans et a coûté 1 milliard tout accords compris." #louvreabudhabi] CLUSTER 7: CLUSTER 8: [@clubj_ 値段もそれなりだしね💦] [@Aghaye_Biikhat اره دیدم😃] [@nyanmarubl にゃんちゃんたらwwwそうそう、お願い決まった?] [@ron_im_iz Will do :)] [ポチれよ] [好み似てるところある] [@hayashida_sannn 元気もらえる😄] [@kr1h_ アーラシュくんが57で止まってる悲しい現実] CLUSTER 9:
// Recursively remove any pre-existing saved model so the save below starts from scratch.
dbutils.fs.rm("/datasets/model", true) 
res24: Boolean = false
// Save the model: only its cluster centers are persisted, as an object file under /datasets/model.
sc.makeRDD(model.clusterCenters).saveAsObjectFile("/datasets/model")
import org.apache.spark.mllib.clustering.KMeans
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.clustering.KMeansModel
import org.apache.spark.mllib.clustering.KMeans import org.apache.spark.mllib.linalg.{Vector, Vectors} import org.apache.spark.mllib.clustering.KMeansModel
// Checking if the model works
// Cluster id the prediction below is compared against.
val clusterNumber = 5

// Location of the cluster centers written by the save cell.
val modelFile = "/datasets/model"

// Rebuild a KMeansModel from the cluster centers read back from the object file.
val model: KMeansModel = new KMeansModel(sc.objectFile[Vector](modelFile).collect)
// Evaluates to true only if the sample text is assigned to cluster `clusterNumber`.
model.predict(featurize("واحد صاحبى لو حد يعرف اكونت وزير التعليم ")) == clusterNumber
clusterNumber: Int = 5 modelFile: String = /datasets/model model: org.apache.spark.mllib.clustering.KMeansModel = org.apache.spark.mllib.clustering.KMeansModel@4b53f956 res26: Boolean = false