import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD

// Create an RDD of matrix entries explicitly with the MatrixEntry(row, col, value) class.
val entries: RDD[MatrixEntry] = sc.parallelize(Array(MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)))
entries: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.distributed.MatrixEntry] = ParallelCollectionRDD[30134] at parallelize at command-1805207615647108:1
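The Scala cell stops at the entries RDD. A minimal sketch of the corresponding construction and conversions (mirroring the Python cell below, and assuming the entries RDD just defined) might look like this:

import org.apache.spark.mllib.linalg.distributed.CoordinateMatrix

// Create a CoordinateMatrix from the RDD[MatrixEntry] above.
val mat: CoordinateMatrix = new CoordinateMatrix(entries)

// Get its size. Dimensions are inferred as one past the largest index seen,
// so the entry at row index 6 makes this a 7 x 2 matrix.
val m = mat.numRows() // 7
val n = mat.numCols() // 2

// Convert to the other distributed matrix types.
val rowMat = mat.toRowMatrix()
val indexedRowMat = mat.toIndexedRowMatrix()
val blockMat = mat.toBlockMatrix()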
%py
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

# Create an RDD of coordinate entries.
#   - This can be done explicitly with the MatrixEntry class:
entries = sc.parallelize([MatrixEntry(0, 0, 1.2), MatrixEntry(1, 0, 2.1), MatrixEntry(6, 1, 3.7)])
#   - or using (long, long, float) tuples:
entries = sc.parallelize([(0, 0, 1.2), (1, 0, 2.1), (2, 1, 3.7)])

# Create a CoordinateMatrix from an RDD of MatrixEntries.
mat = CoordinateMatrix(entries)

# Get its size.
m = mat.numRows()  # 3
n = mat.numCols()  # 2
print((m, n))

# Get the entries as an RDD of MatrixEntries.
entriesRDD = mat.entries

# Convert to a RowMatrix.
rowMat = mat.toRowMatrix()

# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()

# Convert to a BlockMatrix.
blockMat = mat.toBlockMatrix()
(3L, 2L)
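Because a CoordinateMatrix stores bare (i, j, value) entries, transposing it only swaps each entry's indices, while heavier algebra goes through BlockMatrix. A short sketch, again assuming the Scala mat defined above:

// Transpose is cheap: each MatrixEntry(i, j, v) simply becomes MatrixEntry(j, i, v).
val matT: CoordinateMatrix = mat.transpose()

// For distributed multiplication, convert to BlockMatrix first,
// e.g. to form the 2 x 2 Gram matrix matT * mat.
val blockT = matT.toBlockMatrix().cache()
val block = mat.toBlockMatrix().cache()
val gram = blockT.multiply(block) // a 2 x 2 BlockMatrix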