import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.MatrixEntry

// an RDD of matrix entries
val entries: RDD[MatrixEntry] = sc.parallelize(Array(
  MatrixEntry(0, 0, 1.2),
  MatrixEntry(1, 0, 2.1),
  MatrixEntry(6, 1, 3.7)))
entries: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.distributed.MatrixEntry] = ParallelCollectionRDD[30195] at parallelize at command-1805207615647121:1
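The cell above builds the entries RDD but stops there. A minimal Scala sketch of the natural next step: wrap the entries in a CoordinateMatrix and convert it to a BlockMatrix. The 2x2 block size below is an arbitrary choice for illustration, not from the original cell.

```scala
import org.apache.spark.mllib.linalg.distributed.{BlockMatrix, CoordinateMatrix}

// Wrap the entries in a CoordinateMatrix; the row and column counts are
// inferred from the maximum indices seen in the entries (here 7 x 2).
val coordMat: CoordinateMatrix = new CoordinateMatrix(entries)

// Convert to a BlockMatrix with 2x2 blocks (block size chosen for illustration)
// and check that the block indices and dimensions are consistent.
val blockMat: BlockMatrix = coordMat.toBlockMatrix(2, 2).cache()
blockMat.validate()
```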
%py
from pyspark.mllib.linalg import Matrices
from pyspark.mllib.linalg.distributed import BlockMatrix

# Create an RDD of sub-matrix blocks.
blocks = sc.parallelize([((0, 0), Matrices.dense(3, 2, [1, 2, 3, 4, 5, 6])),
                         ((1, 0), Matrices.dense(3, 2, [7, 8, 9, 10, 11, 12]))])

# Create a BlockMatrix from an RDD of sub-matrix blocks.
mat = BlockMatrix(blocks, 3, 2)

# Get its size.
m = mat.numRows()  # 6
n = mat.numCols()  # 2
print (m, n)

# Get the blocks as an RDD of sub-matrix blocks.
blocksRDD = mat.blocks

# Convert to a LocalMatrix.
localMat = mat.toLocalMatrix()

# Convert to an IndexedRowMatrix.
indexedRowMat = mat.toIndexedRowMatrix()

# Convert to a CoordinateMatrix.
coordinateMat = mat.toCoordinateMatrix()
(6L, 2L)
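For comparison, a Scala sketch of the same BlockMatrix construction (not part of the original notebook). The Scala API mirrors the Python one: an RDD of ((blockRowIndex, blockColIndex), sub-matrix) pairs plus the per-block dimensions.

```scala
import org.apache.spark.mllib.linalg.Matrices
import org.apache.spark.mllib.linalg.distributed.BlockMatrix

// Create an RDD of ((blockRowIndex, blockColIndex), sub-matrix) blocks.
// Matrices.dense is column-major: (3, 2, Array(1,2,3,4,5,6)) has
// columns [1,2,3] and [4,5,6].
val blocks = sc.parallelize(Seq(
  ((0, 0), Matrices.dense(3, 2, Array(1.0, 2.0, 3.0, 4.0, 5.0, 6.0))),
  ((1, 0), Matrices.dense(3, 2, Array(7.0, 8.0, 9.0, 10.0, 11.0, 12.0)))))

// Each block is 3 rows x 2 columns, so the full matrix is 6 x 2.
val mat = new BlockMatrix(blocks, 3, 2)

println((mat.numRows(), mat.numCols())) // (6, 2)
```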