02_annotation_2_NerDL_WordEmbeddingsPreTrainedModels(Scala)
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.SparkNLP

import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession
import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.annotators.ner.NerConverter import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.SparkNLP import com.johnsnowlabs.util.Benchmark import org.apache.spark.ml.Pipeline import org.apache.spark.sql.SparkSession
// Obtain the SparkSession through Spark NLP: if no session exists yet this call creates one,
// otherwise it reuses the current SparkSession.
val spark: SparkSession = SparkNLP.start()
spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@74dcea28
// Entry stage of the pipeline: reads the raw "text" column and writes to "document".
val document = new DocumentAssembler().setInputCol("text").setOutputCol("document")

// Sentence detection over "document", written to "sentence".
// NOTE(review): no later stage in this pipeline lists "sentence" as an input column.
val sentence = new SentenceDetector()
  .setInputCols("document")
  .setOutputCol("sentence")

// Tokenization over "document" (not "sentence"), written to "token".
val token = new Tokenizer().setInputCols(Array("document")).setOutputCol("token")

// Normalization of "token", written to "normal".
// NOTE(review): no later stage in this pipeline lists "normal" as an input column.
val normalizer = new Normalizer().setInputCols(Array("token")).setOutputCol("normal")

// Pre-trained word embeddings (default pretrained model), written to "embeddings".
// The input columns are declared explicitly, like every other annotator in this pipeline,
// instead of relying on whatever column names were stored with the downloaded model.
// "document" and "token" are the same columns the NER stage consumes next to "embeddings".
val embeddings = WordEmbeddingsModel.pretrained()
  .setInputCols("document", "token")
  .setOutputCol("embeddings")

// Pre-trained NerDL model (default pretrained model): consumes "document", "token" and
// "embeddings", and writes its tags to "ner".
val ner = NerDLModel.pretrained()
  .setInputCols(Array("document", "token", "embeddings"))
  .setOutputCol("ner")

// Converts the "ner" tags, together with "document" and "token", into "ner_converter".
val nerConverter = new NerConverter()
  .setInputCols(Array("document", "token", "ner"))
  .setOutputCol("ner_converter")

// Final stage: finishes the "ner" and "ner_converter" columns.
// Output is requested as arrays, with metadata included and the annotation columns kept;
// "@" and "#" are configured as the annotation and value split symbols.
val finisher = new Finisher()
  .setInputCols(Array("ner", "ner_converter"))
  .setOutputAsArray(true)
  .setIncludeMetadata(true)
  .setCleanAnnotations(false)
  .setAnnotationSplitSymbol("@")
  .setValueSplitSymbol("#")

// Assemble the stages in execution order: each stage's input columns are produced
// by a stage listed before it.
val pipeline = new Pipeline().setStages(
  Array(document, sentence, token, normalizer, embeddings, ner, nerConverter, finisher)
)
document: com.johnsnowlabs.nlp.DocumentAssembler = document_4ef798c59cce sentence: com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector = SENTENCE_bc41b6102915 token: com.johnsnowlabs.nlp.annotators.Tokenizer = REGEX_TOKENIZER_7d063a8a5b4b normalizer: com.johnsnowlabs.nlp.annotators.Normalizer = NORMALIZER_582ef596df3a embeddings: com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel = WORD_EMBEDDINGS_MODEL_2f4ad586e8a2 ner: com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel = NerDLModel_06f049935f0f nerConverter: com.johnsnowlabs.nlp.annotators.ner.NerConverter = NER_CONVERTER_8eb1b0ed5513 finisher: com.johnsnowlabs.nlp.Finisher = finisher_e68a67dba5b8 pipeline: org.apache.spark.ml.Pipeline = pipeline_b77c27f8f3be
// Testing dataset: two rows, each an integer "_id" and the raw "text" to annotate.
// The triple-quoted literals include their surrounding newlines and indentation as written.
val testing = {
  val samples = Seq(
    (1, """
  Google LLC is an American multinational technology company that specializes in Internet-related  services and products, which include online advertising technologies, search engine, cloud computing, software, and hardware. It is considered one of the Big Four technology companies, alongside Amazon, Apple and Facebook.
  """),
    (2, """
  Spider-Man is a fictional superhero created by writer-editor Stan Lee and writer-artist Steve Ditko. He first appeared in the anthology comic book Amazing Fantasy #15 in the Silver Age of Comic Books.
  """)
  )
  samples.toDF("_id", "text")
}

// Keep in mind we don't need to train anything in our pipeline since it comes already
// pre-trained, so we go ahead and use the same testing dataset in the .fit() stage as well.
//
// Spark transformations are lazy: transform() only describes the work, and the annotators
// actually run when an action such as show() is called. The show() calls therefore live
// inside the timed block so that the "Time to convert and show" measurement really covers
// both the conversion and the display. The annotated DataFrame is still returned as `result`.
val result = Benchmark.time("Time to convert and show") {
  val annotated = pipeline.fit(testing).transform(testing)
  annotated.select("finished_ner").show(truncate = false)
  annotated.select("finished_ner_converter").show(truncate = false)
  annotated
}

// printSchema() is side-effecting (it prints), so it is called with parentheses.
result.printSchema()
Time to convert and show: 17.459222174sec +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |finished_ner | +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ |[I-ORG, I-ORG, O, O, I-MISC, O, O, O, O, O, O, I-MISC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O, O, O, O, I-ORG, O, I-ORG, O, I-ORG, O]| |[I-PER, O, O, O, O, O, O, O, I-PER, I-PER, O, O, I-PER, I-PER, O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, I-MISC, I-MISC, I-MISC, I-MISC, I-MISC, O] | +-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ +-----------------------------------------------------------------------------------+ |finished_ner_converter | +-----------------------------------------------------------------------------------+ |[Google LLC, American, Internet-related, Big Four, Amazon, Apple, Facebook] | |[Spider-Man, Stan Lee, Steve Ditko, Amazing Fantasy #15, Silver Age of Comic Books]| +-----------------------------------------------------------------------------------+ root |-- _id: integer (nullable = false) |-- text: string (nullable = true) |-- document: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- 
sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- sentence: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- token: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- normal: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- embeddings: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: 
string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- ner: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- ner_converter: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- annotatorType: string (nullable = true) | | |-- begin: integer (nullable = false) | | |-- end: integer (nullable = false) | | |-- result: string (nullable = true) | | |-- metadata: map (nullable = true) | | | |-- key: string | | | |-- value: string (valueContainsNull = true) | | |-- embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) | | |-- sentence_embeddings: array (nullable = true) | | | |-- element: float (containsNull = false) |-- finished_ner: array (nullable = true) | |-- element: string (containsNull = true) |-- finished_ner_converter: array (nullable = true) | |-- element: string (containsNull = true) |-- finished_ner_metadata: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- _1: string (nullable = true) | | |-- _2: string (nullable = true) |-- finished_ner_converter_metadata: array (nullable = true) | |-- element: struct (containsNull = true) | | |-- _1: string (nullable = true) | | |-- _2: string (nullable = true) testing: 
org.apache.spark.sql.DataFrame = [_id: int, text: string] result: org.apache.spark.sql.DataFrame = [_id: int, text: string ... 11 more fields]