import com.johnsnowlabs.nlp.annotator._ import com.johnsnowlabs.nlp.annotators.ner.NerConverter import com.johnsnowlabs.nlp.base._ import com.johnsnowlabs.nlp.SparkNLP import com.johnsnowlabs.util.Benchmark import org.apache.spark.ml.Pipeline import org.apache.spark.sql.SparkSession
import com.johnsnowlabs.nlp.annotator._
import com.johnsnowlabs.nlp.annotators.ner.NerConverter
import com.johnsnowlabs.nlp.base._
import com.johnsnowlabs.nlp.SparkNLP
import com.johnsnowlabs.util.Benchmark
import org.apache.spark.ml.Pipeline
import org.apache.spark.sql.SparkSession
// Named-entity-recognition pipeline: every stage reads the annotation column(s)
// named in setInputCol(s) and writes the column named in setOutputCol.

// Reads the raw "text" column and writes "document".
val document = new DocumentAssembler()
  .setInputCol("text")
  .setOutputCol("document")

// Reads "document" and writes "sentence".
// NOTE(review): no later stage in this pipeline consumes "sentence"; it only
// shows up as a column in the resulting DataFrame.
val sentence = new SentenceDetector()
  .setInputCols(Array("document"))
  .setOutputCol("sentence")

// Reads "document" (not "sentence") and writes "token".
val token = new Tokenizer()
  .setInputCols("document")
  .setOutputCol("token")

// Reads "token" and writes "normal".
// NOTE(review): "normal" is not consumed by any later stage either.
val normalizer = new Normalizer()
  .setInputCols("token")
  .setOutputCol("normal")

// Pretrained word-embeddings model; only the output column is set here.
// NOTE(review): input columns are not set explicitly -- presumably the
// pretrained model ships with its own defaults; confirm against the model.
val embeddings = WordEmbeddingsModel.pretrained()
  .setOutputCol("embeddings")

// Pretrained NER model: reads "document", "token" and "embeddings", writes "ner".
val ner = NerDLModel.pretrained()
  .setInputCols("document", "token", "embeddings")
  .setOutputCol("ner")

// Reads "document", "token" and "ner", writes "ner_converter".
val nerConverter = new NerConverter()
  .setInputCols("document", "token", "ner")
  .setOutputCol("ner_converter")

// Finishes the "ner" and "ner_converter" columns. Metadata is included, output
// is produced as arrays, and the annotation columns are kept
// (cleanAnnotations = false) so they remain available in the result.
val finisher = new Finisher()
  .setInputCols("ner", "ner_converter")
  .setIncludeMetadata(true)
  .setOutputAsArray(true)
  .setCleanAnnotations(false)
  .setAnnotationSplitSymbol("@")
  .setValueSplitSymbol("#")

// Stage order matters: each stage must come after the stages whose output
// columns it reads.
val pipeline = new Pipeline().setStages(
  Array(
    document,
    sentence,
    token,
    normalizer,
    embeddings,
    ner,
    nerConverter,
    finisher
  )
)
document: com.johnsnowlabs.nlp.DocumentAssembler = document_4ef798c59cce
sentence: com.johnsnowlabs.nlp.annotators.sbd.pragmatic.SentenceDetector = SENTENCE_bc41b6102915
token: com.johnsnowlabs.nlp.annotators.Tokenizer = REGEX_TOKENIZER_7d063a8a5b4b
normalizer: com.johnsnowlabs.nlp.annotators.Normalizer = NORMALIZER_582ef596df3a
embeddings: com.johnsnowlabs.nlp.embeddings.WordEmbeddingsModel = WORD_EMBEDDINGS_MODEL_2f4ad586e8a2
ner: com.johnsnowlabs.nlp.annotators.ner.dl.NerDLModel = NerDLModel_06f049935f0f
nerConverter: com.johnsnowlabs.nlp.annotators.ner.NerConverter = NER_CONVERTER_8eb1b0ed5513
finisher: com.johnsnowlabs.nlp.Finisher = finisher_e68a67dba5b8
pipeline: org.apache.spark.ml.Pipeline = pipeline_b77c27f8f3be
// This is our testing dataset: two rows of (_id, text).
// NOTE(review): `.toDS` on a Seq relies on the Spark session implicits being in
// scope; they are not imported in the visible cells -- presumably the notebook
// environment provides them. Confirm before running elsewhere.
val testing = Seq(
  (1, """ Google LLC is an American multinational technology company that specializes in Internet-related services and products, which include online advertising technologies, search engine, cloud computing, software, and hardware. It is considered one of the Big Four technology companies, alongside Amazon, Apple and Facebook. """),
  (2, """ Spider-Man is a fictional superhero created by writer-editor Stan Lee and writer-artist Steve Ditko. He first appeared in the anthology comic book Amazing Fantasy #15 in the Silver Age of Comic Books. """)
).toDS.toDF("_id", "text")

// Keep in mind we don't need to train anything in our pipeline since it comes
// already pre-trained, so we go ahead and use the same testing dataset in the
// .fit() stage as well.
// NOTE(review): only fit + transform sit inside the timed block; the `show`
// calls below run outside it, so the "convert and show" label may not describe
// what is actually measured -- confirm the intended scope of the timing.
val result = Benchmark.time("Time to convert and show") {
  pipeline.fit(testing).transform(testing)
}

// Display the finished NER tags, the finished entity chunks, and the full schema.
result.select("finished_ner").show(truncate = false)
result.select("finished_ner_converter").show(truncate = false)
result.printSchema()
Time to convert and show: 17.459222174sec
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|finished_ner |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[I-ORG, I-ORG, O, O, I-MISC, O, O, O, O, O, O, I-MISC, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, O, O, O, O, I-ORG, O, I-ORG, O, I-ORG, O]|
|[I-PER, O, O, O, O, O, O, O, I-PER, I-PER, O, O, I-PER, I-PER, O, O, O, O, O, O, O, O, O, I-MISC, I-MISC, I-MISC, O, O, I-MISC, I-MISC, I-MISC, I-MISC, I-MISC, O] |
+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+-----------------------------------------------------------------------------------+
|finished_ner_converter |
+-----------------------------------------------------------------------------------+
|[Google LLC, American, Internet-related, Big Four, Amazon, Apple, Facebook] |
|[Spider-Man, Stan Lee, Steve Ditko, Amazing Fantasy #15, Silver Age of Comic Books]|
+-----------------------------------------------------------------------------------+
root
|-- _id: integer (nullable = false)
|-- text: string (nullable = true)
|-- document: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- sentence: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- token: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- normal: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- embeddings: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- ner: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- ner_converter: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- annotatorType: string (nullable = true)
| | |-- begin: integer (nullable = false)
| | |-- end: integer (nullable = false)
| | |-- result: string (nullable = true)
| | |-- metadata: map (nullable = true)
| | | |-- key: string
| | | |-- value: string (valueContainsNull = true)
| | |-- embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
| | |-- sentence_embeddings: array (nullable = true)
| | | |-- element: float (containsNull = false)
|-- finished_ner: array (nullable = true)
| |-- element: string (containsNull = true)
|-- finished_ner_converter: array (nullable = true)
| |-- element: string (containsNull = true)
|-- finished_ner_metadata: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- _1: string (nullable = true)
| | |-- _2: string (nullable = true)
|-- finished_ner_converter_metadata: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- _1: string (nullable = true)
| | |-- _2: string (nullable = true)
testing: org.apache.spark.sql.DataFrame = [_id: int, text: string]
result: org.apache.spark.sql.DataFrame = [_id: int, text: string ... 11 more fields]
SDS-2.x, Scalable Data Engineering Science
This is a minor augmentation/update of:
Last refresh: Never