import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.IndexedRow
import org.apache.spark.rdd.RDD

val rows: RDD[IndexedRow] = sc.parallelize(Array(
  IndexedRow(2, Vectors.dense(1, 3)),
  IndexedRow(4, Vectors.dense(4, 5)))) // an RDD of indexed rows
rows: org.apache.spark.rdd.RDD[org.apache.spark.mllib.linalg.distributed.IndexedRow] = ParallelCollectionRDD[30166] at parallelize at command-1805207615647095:1
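The Scala cell above stops at the RDD. As a minimal follow-up sketch (an addition, not from the original notebook), the same IndexedRowMatrix wrapper shown in the Python cell below can be built on the Scala side by wrapping `rows`:

import org.apache.spark.mllib.linalg.distributed.IndexedRowMatrix

// Wrap the RDD of IndexedRows in a distributed IndexedRowMatrix.
val mat = new IndexedRowMatrix(rows)
val m = mat.numRows() // 5: the largest row index (4) plus one; rows 0, 1 and 3 are empty
val n = mat.numCols() // 2: the length of the dense row vectors
println((m, n))       // (5,2)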
%py
from pyspark.mllib.linalg.distributed import IndexedRow, IndexedRowMatrix

# Create an RDD of indexed rows.
#   - This can be done explicitly with the IndexedRow class:
indexedRows = sc.parallelize([IndexedRow(0, [1, 2, 3]),
                              IndexedRow(1, [4, 5, 6]),
                              IndexedRow(2, [7, 8, 9]),
                              IndexedRow(3, [10, 11, 12])])
#   - or by using (long, vector) tuples:
indexedRows = sc.parallelize([(0, [1, 2, 3]), (1, [4, 5, 6]),
                              (2, [7, 8, 9]), (3, [10, 11, 12])])

# Create an IndexedRowMatrix from an RDD of IndexedRows.
mat = IndexedRowMatrix(indexedRows)

# Get its size.
m = mat.numRows()  # 4
n = mat.numCols()  # 3
print(m, n)

# Get the rows as an RDD of IndexedRows.
rowsRDD = mat.rows

# Convert to a RowMatrix by dropping the row indices.
rowMat = mat.toRowMatrix()

# Convert to a CoordinateMatrix.
coordinateMat = mat.toCoordinateMatrix()

# Convert to a BlockMatrix.
blockMat = mat.toBlockMatrix()
(4L, 3L)
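For symmetry with the Python cell, here is a hedged Scala sketch of the same conversions, assuming the `mat` built in the Scala sketch above (an addition, not from the original notebook):

// Convert to a RowMatrix by dropping the row indices.
val rowMat = mat.toRowMatrix()
// Convert to a CoordinateMatrix of (i, j, value) MatrixEntry records.
val coordinateMat = mat.toCoordinateMatrix()
// Convert to a BlockMatrix, a grid of local matrix blocks.
val blockMat = mat.toBlockMatrix()
coordinateMat.entries.collect().foreach(println) // e.g. MatrixEntry(2,0,1.0), MatrixEntry(2,1,3.0), ...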