ScaDaMaLe Course site and book

/scalable-data-science/000_0-sds-3-x-projects/student-project-01_group-TheTwoCultures/01_load_data
import org.apache.spark.sql.functions.{col, concat_ws, udf, flatten, explode, collect_list, collect_set, lit} import org.apache.spark.sql.types.{ArrayType, StructType, StructField, StringType, IntegerType} import com.databricks.spark.xml._ import org.apache.spark.sql.functions._ read_xml: (file_name: String)org.apache.spark.sql.DataFrame get_dataset: (file_name: String)org.apache.spark.sql.DataFrame
save_df: (df: org.apache.spark.sql.DataFrame, filePath: String)Unit load_df: (filePath: String)org.apache.spark.sql.DataFrame no_forums: (df: org.apache.spark.sql.DataFrame)Long
dbfs:/datasets/student-project-01/flashback/familjeliv-allmanna-ekonomi_df familjeliv-allmanna-ekonomi_df already exists! dbfs:/datasets/student-project-01/flashback/familjeliv-sexsamlevnad_df familjeliv-sexsamlevnad_df already exists! dbfs:/datasets/student-project-01/flashback/flashback-ekonomi_df flashback-ekonomi_df already exists! dbfs:/datasets/student-project-01/flashback/flashback-sex_df flashback-sex_df already exists! fl_root: String = dbfs:/datasets/student-project-01/familjeliv/ fb_root: String = dbfs:/datasets/student-project-01/flashback/ fl_data: Array[String] = Array(familjeliv-allmanna-ekonomi, familjeliv-sexsamlevnad) fb_data: Array[String] = Array(flashback-ekonomi, flashback-sex)
// Load a single Familjeliv forum dump so its structure can be inspected in the cells below.
// Each definition sits on its own line: Scala needs a newline (or ';') between statements.
// The bindings stay `var` so that other cells may still reassign them.
var file_name = "dbfs:/datasets/student-project-01/familjeliv/familjeliv-allmanna-ekonomi.xml"

// DataFrame returned by read_xml for this file; cached because several cells reuse it.
var xml_df = read_xml(file_name).cache()

// DataFrame returned by get_dataset for the same file; cached for the same reason.
var df = get_dataset(file_name).cache()
file_name: String = dbfs:/datasets/student-project-01/familjeliv/familjeliv-allmanna-ekonomi.xml xml_df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [_id: string, _title: string ... 2 more fields] df: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [thread_id: string, thread_title: string ... 5 more fields]
// Print the column tree of the DataFrame produced by read_xml.
xml_df.printSchema()
root |-- _id: string (nullable = false) |-- _title: string (nullable = false) |-- _url: string (nullable = false) |-- thread: array (nullable = false) | |-- element: struct (containsNull = true) | | |-- _id: string (nullable = false) | | |-- _title: string (nullable = false) | | |-- _url: string (nullable = false) | | |-- text: struct (nullable = false) | | | |-- sentence: array (nullable = false) | | | | |-- element: struct (containsNull = true) | | | | | |-- w: array (nullable = true) | | | | | | |-- element: string (containsNull = true)
// Preview the first 10 rows of the DataFrame produced by read_xml.
xml_df.show(10)
+------+--------------------+--------------------+--------------------+ | _id| _title| _url| thread| +------+--------------------+--------------------+--------------------+ |19-290|Allmänna rubriker...|http://www.familj...|[[70148929, Frivi...| |19-290|Allmänna rubriker...|http://www.familj...|[[58302374, Missh...| |19-290|Allmänna rubriker...|http://www.familj...|[[36330819, Är så...| |19-290|Allmänna rubriker...|http://www.familj...|[[75852809, Hur k...| |19-290|Allmänna rubriker...|http://www.familj...|[[42304381, Hej a...| |19-290|Allmänna rubriker...|http://www.familj...|[[41294375, när p...| |19-295|Allmänna rubriker...|http://www.familj...|[[47653437, Har v...| |19-295|Allmänna rubriker...|http://www.familj...|[[75266317, Anmäl...| |19-295|Allmänna rubriker...|http://www.familj...|[[76559817, Fel a...| |19-295|Allmänna rubriker...|http://www.familj...|[[62028128, Vad g...| +------+--------------------+--------------------+--------------------+ only showing top 10 rows
// Print the column tree of the DataFrame produced by get_dataset.
df.printSchema()
root |-- thread_id: string (nullable = true) |-- thread_title: string (nullable = true) |-- w: string (nullable = false) |-- forum_id: string (nullable = true) |-- forum_title: string (nullable = true) |-- platform: string (nullable = false) |-- corpus_id: string (nullable = false)
// Render the get_dataset DataFrame with the notebook's display helper.
display(df)