Some of Types Cannot Be Determined by the First 100 Rows
Python pyspark.sql.Row() Examples
The following are 30 code examples showing how to use pyspark.sql.Row(). These examples are extracted from open source projects. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
You may check out the related API usage on the sidebar.
You may also want to check out all available functions/classes of the module pyspark.sql, or try the search function.
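Before the examples, here is a minimal sketch of the core pyspark.sql.Row behavior the snippets below rely on (construction by keyword, access by name or position, asDict(), and rebuilding a row from a dict). The field names are illustrative and not taken from any particular example.

from pyspark.sql import Row

# A Row behaves much like a named tuple: build it from keywords,
# then read fields by name or by position.
person = Row(id=1, name="Alice")
assert person.name == "Alice"
assert person[0] == 1
assert person.asDict() == {"id": 1, "name": "Alice"}

# Row("a", "b") acts as a reusable row "class" with a fixed set of fields.
Person = Row("id", "name")
people = [Person(1, "Alice"), Person(2, "Bob")]

# Expanding a dict back into keyword arguments is a common way to produce
# a modified copy, as several examples below do with Row(**row.asDict()).
updated = Row(**dict(person.asDict(), name="Alicia"))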
Example 1
def _build_local_features(np_dtype):
    """
    Build numpy array (i.e. local) features.
    """
    # Build local features and DataFrame from it
    local_features = []
    np.random.seed(997)
    for idx in range(100):
        _dict = {'idx': idx}
        for colname, _ in _input_mapping.items():
            colvalue = np.random.randn(_tensor_size) * 100
            _dict[colname] = colvalue.astype(np_dtype).tolist()
        local_features.append(Row(**_dict))
    return local_features
Example 2
def test_map_rows_sql_1(self):
    data = [Row(x=float(x)) for x in range(5)]
    df = self.sql.createDataFrame(data)
    with IsolatedSession() as issn:
        # The placeholder that corresponds to column 'x' as a whole column
        x = tf.placeholder(tf.double, shape=[], name="x")
        # The output that adds 3 to x
        z = tf.add(x, 3, name='z')
        # Let's register these computations in SQL.
        makeGraphUDF(issn.graph, "map_rows_sql_1", [z])
        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_rows_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
        assert data2[0].z == 3.0, data2
Example 3
def test_map_blocks_sql_1(self):
    data = [Row(x=float(x)) for x in range(5)]
    df = self.sql.createDataFrame(data)
    with IsolatedSession() as issn:
        # The placeholder that corresponds to column 'x' as a whole column
        x = tf.placeholder(tf.double, shape=[None], name="x")
        # The output that adds 3 to x
        z = tf.add(x, 3, name='z')
        # Let's register these computations in SQL.
        makeGraphUDF(issn.graph, "map_blocks_sql_1", [z], blocked=True)
        # Here we go, for the SQL users, straight from PySpark.
        df2 = df.selectExpr("map_blocks_sql_1(x) AS z")
        print("df2 = %s" % df2)
        data2 = df2.collect()
        assert len(data2) == 5, data2
        assert data2[0].z == 3.0, data2
Example 4
def _monkey_patch_RDD(sparkSession):
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    RDD.toDF = toDF
Example 5
def _inferSchemaFromList(self, data, names=None):
    """
    Infer schema from list of Row or tuple.

    :param data: list of Row or tuple
    :param names: list of column names
    :return: :class:`pyspark.sql.types.StructType`
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")
    first = data[0]
    if type(first) is dict:
        warnings.warn("inferring schema from dict is deprecated,"
                      "please use pyspark.sql.Row instead")
    schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
    if _has_nulltype(schema):
        raise ValueError("Some of types cannot be determined after inferring")
    return schema
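As a hypothetical illustration (not part of the snippet above) of when this helper raises its error: if a column is None in every row of a local list, inference leaves it as NullType, and the usual fix is to pass an explicit schema to createDataFrame. The column names below are made up.

from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructType, StructField, LongType, StringType

spark = SparkSession.builder.master("local[2]").getOrCreate()
rows = [Row(id=1, label=None), Row(id=2, label=None)]

# spark.createDataFrame(rows) would raise:
#   ValueError: Some of types cannot be determined after inferring

# Pinning the all-null column to an explicit type skips inference entirely.
schema = StructType([
    StructField("id", LongType(), True),
    StructField("label", StringType(), True),
])
df = spark.createDataFrame(rows, schema)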
Example 6
def _test():
    import os
    import sys
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        sys.exit(-1)
Example 7
def _monkey_patch_RDD(sparkSession):
    def toDF(self, schema=None, sampleRatio=None):
        """
        Converts current :class:`RDD` into a :class:`DataFrame`

        This is a shorthand for ``spark.createDataFrame(rdd, schema, sampleRatio)``

        :param schema: a :class:`pyspark.sql.types.StructType` or list of names of columns
        :param samplingRatio: the sample ratio of rows used for inferring
        :return: a DataFrame

        >>> rdd.toDF().collect()
        [Row(name=u'Alice', age=1)]
        """
        return sparkSession.createDataFrame(self, schema, sampleRatio)

    RDD.toDF = toDF
Example 8
def _inferSchemaFromList(self, data, names=None):
    """
    Infer schema from list of Row or tuple.

    :param data: list of Row or tuple
    :param names: list of column names
    :return: :class:`pyspark.sql.types.StructType`
    """
    if not data:
        raise ValueError("can not infer schema from empty dataset")
    first = data[0]
    if type(first) is dict:
        warnings.warn("inferring schema from dict is deprecated,"
                      "please use pyspark.sql.Row instead")
    schema = reduce(_merge_type, (_infer_schema(row, names) for row in data))
    if _has_nulltype(schema):
        raise ValueError("Some of types cannot be determined after inferring")
    return schema
Example 9
def _test():
    import os
    import doctest
    from pyspark.context import SparkContext
    from pyspark.sql import Row
    import pyspark.sql.session

    os.chdir(os.environ["SPARK_HOME"])

    globs = pyspark.sql.session.__dict__.copy()
    sc = SparkContext('local[4]', 'PythonTest')
    globs['sc'] = sc
    globs['spark'] = SparkSession(sc)
    globs['rdd'] = rdd = sc.parallelize(
        [Row(field1=1, field2="row1"),
         Row(field1=2, field2="row2"),
         Row(field1=3, field2="row3")])
    globs['df'] = rdd.toDF()
    (failure_count, test_count) = doctest.testmod(
        pyspark.sql.session, globs=globs,
        optionflags=doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE)
    globs['sc'].stop()
    if failure_count:
        exit(-1)
Example 10
def convert_svmrank_to_xgboost(df: DataFrame) -> DataFrame:
    def convert_one(row: Row) -> Row:
        # For now place the .xgb right next to the svmrank files. Naming/path
        # options could be added if needed later.
        out_path = row.path + '.xgb'
        _convert_xgboost_remote(row.path, out_path)
        return Row(**dict(
            row.asDict(),
            vec_format='xgboost',
            path=out_path))

    # Each row represents potentially gigabytes, convince spark
    # to create a partition per row.
    rdd_xgb = mt.partition_per_row(df.rdd).map(convert_one)
    df_xgb = df.sql_ctx.createDataFrame(rdd_xgb, df.schema)  # type: ignore
    # Return both the xgb and svmrank datasets since
    # we aren't purging the related files. df is safe to reuse since
    # svmrank conversion returns a new dataframe with no lineage.
    return df.union(df_xgb)
Example 11
def test_lf_applier_spark_preprocessor_memoized(self) -> None:
    sc = SparkContext.getOrCreate()
    sql = SQLContext(sc)

    @preprocessor(memoize=True)
    def square_memoize(x: DataPoint) -> DataPoint:
        return Row(num=x.num, num_squared=x.num ** 2)

    @labeling_function(pre=[square_memoize])
    def fp_memoized(x: DataPoint) -> int:
        return 0 if x.num_squared > 42 else -1

    df = pd.DataFrame(dict(num=DATA))
    rdd = sql.createDataFrame(df).rdd
    applier = SparkLFApplier([f, fp_memoized])
    L = applier.apply(rdd)
    np.testing.assert_equal(L, L_PREPROCESS_EXPECTED)
Example 12
def test_decorator_mapper_memoized_none(self) -> None:
    square_hit_tracker = SquareHitTracker()

    @lambda_mapper(memoize=True)
    def square(x: DataPoint) -> DataPoint:
        fields = x.asDict()
        fields["num_squared"] = square_hit_tracker(x.num)
        if x.num == 21:
            return None
        return Row(**fields)

    x21 = self._get_x(21)
    x21_mapped = square(x21)
    self.assertIsNone(x21_mapped)
    self.assertEqual(square_hit_tracker.n_hits, 1)
    x21_mapped = square(x21)
    self.assertIsNone(x21_mapped)
    self.assertEqual(square_hit_tracker.n_hits, 1)
Example 13
def test_string_indexer_handle_invalid(self):
    df = self.spark.createDataFrame([
        (0, "a"),
        (1, "d"),
        (2, None)], ["id", "label"])

    si1 = StringIndexer(inputCol="label", outputCol="indexed", handleInvalid="keep",
                        stringOrderType="alphabetAsc")
    model1 = si1.fit(df)
    td1 = model1.transform(df)
    actual1 = td1.select("id", "indexed").collect()
    expected1 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)]
    self.assertEqual(actual1, expected1)

    si2 = si1.setHandleInvalid("skip")
    model2 = si2.fit(df)
    td2 = model2.transform(df)
    actual2 = td2.select("id", "indexed").collect()
    expected2 = [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)]
    self.assertEqual(actual2, expected2)
Example 14
def test_infer_schema(self):
    rdd = self.sc.parallelize([Row(label=1.0, features=self.dv1),
                               Row(label=0.0, features=self.sv1)])
    df = rdd.toDF()
    schema = df.schema
    field = [f for f in schema.fields if f.name == "features"][0]
    self.assertEqual(field.dataType, self.udt)
    vectors = df.rdd.map(lambda p: p.features).collect()
    self.assertEqual(len(vectors), 2)
    for v in vectors:
        if isinstance(v, SparseVector):
            self.assertEqual(v, self.sv1)
        elif isinstance(v, DenseVector):
            self.assertEqual(v, self.dv1)
        else:
            raise TypeError("expecting a vector but got %r of type %r" % (v, type(v)))
Example 15
def approx_count_distinct(col, rsd=None):
    """Aggregate function: returns a new :class:`Column` for approximate distinct count of
    column `col`.

    :param rsd: maximum estimation error allowed (default = 0.05).
        For rsd < 0.01, it is more efficient to use :func:`countDistinct`

    >>> df.agg(approx_count_distinct(df.age).alias('distinct_ages')).collect()
    [Row(distinct_ages=2)]
    """
    sc = SparkContext._active_spark_context
    if rsd is None:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col))
    else:
        jc = sc._jvm.functions.approx_count_distinct(_to_java_column(col), rsd)
    return Column(jc)
Example 16
def monotonically_increasing_id():
    """A column that generates monotonically increasing 64-bit integers.

    The generated ID is guaranteed to be monotonically increasing and unique, but not consecutive.
    The current implementation puts the partition ID in the upper 31 bits, and the record number
    within each partition in the lower 33 bits. The assumption is that the data frame has
    less than 1 billion partitions, and each partition has less than 8 billion records.

    .. note:: The function is non-deterministic because its result depends on partition IDs.

    As an example, consider a :class:`DataFrame` with two partitions, each with 3 records.
    This expression would return the following IDs:
    0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594.

    >>> df0 = sc.parallelize(range(2), 2).mapPartitions(lambda x: [(1,), (2,), (3,)]).toDF(['col1'])
    >>> df0.select(monotonically_increasing_id().alias('id')).collect()
    [Row(id=0), Row(id=1), Row(id=2), Row(id=8589934592), Row(id=8589934593), Row(id=8589934594)]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.monotonically_increasing_id())
Example 17
def randn(seed=None):
    """Generates a column with independent and identically distributed (i.i.d.) samples from
    the standard normal distribution.

    .. note:: The function is non-deterministic in general case.

    >>> df.withColumn('randn', randn(seed=42)).collect()
    [Row(age=2, name=u'Alice', randn=-0.7556247885860078),
     Row(age=5, name=u'Bob', randn=-0.0861619008451133)]
    """
    sc = SparkContext._active_spark_context
    if seed is not None:
        jc = sc._jvm.functions.randn(seed)
    else:
        jc = sc._jvm.functions.randn()
    return Column(jc)
Example 18
def date_format(date, format):
    """
    Converts a date/timestamp/string to a value of string in the format specified by the date
    format given by the second argument.

    A pattern could be for example `dd.MM.yyyy` and could return a string like '18.03.1993'. All
    pattern letters of the Java class `java.text.SimpleDateFormat` can be used.

    .. note:: Use when ever possible specialized functions like `year`. These benefit from a
        specialized implementation.

    >>> df = spark.createDataFrame([('2015-04-08',)], ['dt'])
    >>> df.select(date_format('dt', 'MM/dd/yyy').alias('date')).collect()
    [Row(date=u'04/08/2015')]
    """
    sc = SparkContext._active_spark_context
    return Column(sc._jvm.functions.date_format(_to_java_column(date), format))
Example 19
def predict_func(rows, graph_json, prediction, graph_weights, inp, activation, tf_input,
                 tf_dropout=None, to_keep_dropout=False):
    rows = [r.asDict() for r in rows]
    if len(rows) > 0:
        graph = tf.MetaGraphDef()
        graph = json_format.Parse(graph_json, graph)
        loaded_weights = json.loads(graph_weights)
        loaded_weights = [np.asarray(x) for x in loaded_weights]

        A = [np.asarray(row[inp]) for row in rows]

        new_graph = tf.Graph()
        with tf.Session(graph=new_graph) as sess:
            tf.train.import_meta_graph(graph)
            sess.run(tf.global_variables_initializer())
            tensorflow_set_weights(loaded_weights)
            out_node = tf.get_default_graph().get_tensor_by_name(activation)

            dropout_v = 1.0 if tf_dropout is not None and to_keep_dropout else 0.0
            feed_dict = {tf_input: A} if tf_dropout is None else {tf_input: A, tf_dropout: dropout_v}
            pred = sess.run(out_node, feed_dict=feed_dict)
            for i in range(0, len(rows)):
                row = rows[i]
                try:
                    # Vectors Dense are handled differently in python 3
                    internal = float(pred[i])
                    row[prediction] = internal
                except:
                    row[prediction] = Vectors.dense(pred[i])
        return [Row(**a) for a in rows]
    return []
Example 20
def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SparkSession.

    >>> from datetime import datetime
    >>> spark = SparkSession(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
    ...     time=datetime(2014, 8, 1, 14, 1, 5))])
    >>> df = allTypes.toDF()
    >>> df.createOrReplaceTempView("allTypes")
    >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
    ...     'from allTypes where b and i > 0').collect()
    [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
        dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
    >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
    [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
    """
    from pyspark.sql.context import SQLContext
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    if jsparkSession is None:
        if self._jvm.SparkSession.getDefaultSession().isDefined() \
                and not self._jvm.SparkSession.getDefaultSession().get() \
                    .sparkContext().isStopped():
            jsparkSession = self._jvm.SparkSession.getDefaultSession().get()
        else:
            jsparkSession = self._jvm.SparkSession.builder().getOrCreate()
            # jsparkSession = self._jvm.SparkSession(self._jsc.sc())
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.sqlContext()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    _monkey_patch_RDD(self)
    install_exception_handler()
    # If we had an instantiated SparkSession attached with a SparkContext
    # which is stopped now, we need to renew the instantiated SparkSession.
    # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
    if SparkSession._instantiatedSession is None \
            or SparkSession._instantiatedSession._sc._jsc is None:
        SparkSession._instantiatedSession = self
        self._jvm.SparkSession.setDefaultSession(self._jsparkSession)
Example 21
def range(self, start, end=None, step=1, numPartitions=None):
    """
    Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named
    ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
    step value ``step``.

    :param start: the start value
    :param end: the end value (exclusive)
    :param step: the incremental step (default: 1)
    :param numPartitions: the number of partitions of the DataFrame
    :return: :class:`DataFrame`

    >>> spark.range(1, 7, 2).collect()
    [Row(id=1), Row(id=3), Row(id=5)]

    If only one argument is specified, it will be used as the end value.

    >>> spark.range(3).collect()
    [Row(id=0), Row(id=1), Row(id=2)]
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    if end is None:
        jdf = self._jsparkSession.range(0, int(start), int(step), int(numPartitions))
    else:
        jdf = self._jsparkSession.range(int(start), int(end), int(step), int(numPartitions))

    return DataFrame(jdf, self._wrapped)
Example 22
def _inferSchema(self, rdd, samplingRatio=None, names=None):
    """
    Infer schema from an RDD of Row or tuple.

    :param rdd: an RDD of Row or tuple
    :param samplingRatio: sampling ratio, or no sampling (default)
    :return: :class:`pyspark.sql.types.StructType`
    """
    first = rdd.first()
    if not first:
        raise ValueError("The first row in RDD is empty, "
                         "can not infer schema")
    if type(first) is dict:
        warnings.warn("Using RDD of dict to inferSchema is deprecated. "
                      "Use pyspark.sql.Row instead")

    if samplingRatio is None:
        schema = _infer_schema(first, names=names)
        if _has_nulltype(schema):
            for row in rdd.take(100)[1:]:
                schema = _merge_type(schema, _infer_schema(row, names=names))
                if not _has_nulltype(schema):
                    break
            else:
                raise ValueError("Some of types cannot be determined by the "
                                 "first 100 rows, please try again with sampling")
    else:
        if samplingRatio < 0.99:
            rdd = rdd.sample(False, float(samplingRatio))
        schema = rdd.map(lambda row: _infer_schema(row, names)).reduce(_merge_type)
    return schema
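This is the code path behind the error quoted in the title: for an RDD, inference only looks at the first 100 rows, so a column that is None in all of them never gets a type. Below is a sketch of the two usual workarounds (the samplingRatio the message suggests, or an explicit schema); the data and column names are invented for illustration.

from pyspark.sql import Row, SparkSession
from pyspark.sql.types import StructType, StructField, LongType, DoubleType

spark = SparkSession.builder.master("local[2]").getOrCreate()
sc = spark.sparkContext

# 'score' is None for the first 1000 rows, so take(100) never sees a value and
# createDataFrame(rdd) would raise "Some of types cannot be determined by the
# first 100 rows, please try again with sampling".
rdd = sc.parallelize(
    [Row(id=i, score=None if i < 1000 else float(i)) for i in range(2000)])

# Workaround 1: sample a fraction of the whole RDD for inference.
df1 = spark.createDataFrame(rdd, samplingRatio=0.5)

# Workaround 2: skip inference entirely with an explicit schema.
schema = StructType([
    StructField("id", LongType(), True),
    StructField("score", DoubleType(), True),
])
df2 = spark.createDataFrame(rdd, schema)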
Example 23
def __init__(self, sparkContext, jsparkSession=None):
    """Creates a new SparkSession.

    >>> from datetime import datetime
    >>> spark = SparkSession(sc)
    >>> allTypes = sc.parallelize([Row(i=1, s="string", d=1.0, l=1,
    ...     b=True, list=[1, 2, 3], dict={"s": 0}, row=Row(a=1),
    ...     time=datetime(2014, 8, 1, 14, 1, 5))])
    >>> df = allTypes.toDF()
    >>> df.createOrReplaceTempView("allTypes")
    >>> spark.sql('select i+1, d+1, not b, list[1], dict["s"], time, row.a '
    ...     'from allTypes where b and i > 0').collect()
    [Row((i + CAST(1 AS BIGINT))=2, (d + CAST(1 AS DOUBLE))=2.0, (NOT b)=False, list[1]=2, \
        dict[s]=0, time=datetime.datetime(2014, 8, 1, 14, 1, 5), a=1)]
    >>> df.rdd.map(lambda x: (x.i, x.s, x.d, x.l, x.b, x.time, x.row.a, x.list)).collect()
    [(1, u'string', 1.0, 1, True, datetime.datetime(2014, 8, 1, 14, 1, 5), 1, [1, 2, 3])]
    """
    from pyspark.sql.context import SQLContext
    self._sc = sparkContext
    self._jsc = self._sc._jsc
    self._jvm = self._sc._jvm
    if jsparkSession is None:
        jsparkSession = self._jvm.SparkSession.builder().getOrCreate()
    self._jsparkSession = jsparkSession
    self._jwrapped = self._jsparkSession.sqlContext()
    self._wrapped = SQLContext(self._sc, self, self._jwrapped)
    _monkey_patch_RDD(self)
    install_exception_handler()
    # If we had an instantiated SparkSession attached with a SparkContext
    # which is stopped now, we need to renew the instantiated SparkSession.
    # Otherwise, we will use invalid SparkSession when we call Builder.getOrCreate.
    if SparkSession._instantiatedSession is None \
            or SparkSession._instantiatedSession._sc._jsc is None:
        SparkSession._instantiatedSession = self
Example 24
def range(self, start, end=None, step=1, numPartitions=None):
    """
    Create a :class:`DataFrame` with single :class:`pyspark.sql.types.LongType` column named
    ``id``, containing elements in a range from ``start`` to ``end`` (exclusive) with
    step value ``step``.

    :param start: the start value
    :param end: the end value (exclusive)
    :param step: the incremental step (default: 1)
    :param numPartitions: the number of partitions of the DataFrame
    :return: :class:`DataFrame`

    >>> spark.range(1, 7, 2).collect()
    [Row(id=1), Row(id=3), Row(id=5)]

    If only one argument is specified, it will be used as the end value.

    >>> spark.range(3).collect()
    [Row(id=0), Row(id=1), Row(id=2)]
    """
    if numPartitions is None:
        numPartitions = self._sc.defaultParallelism

    if end is None:
        jdf = self._jsparkSession.range(0, int(start), int(step), int(numPartitions))
    else:
        jdf = self._jsparkSession.range(int(start), int(end), int(step), int(numPartitions))

    return DataFrame(jdf, self._wrapped)
Example 25
def sql(self, sqlQuery):
    """Returns a :class:`DataFrame` representing the result of the given query.

    :return: :class:`DataFrame`

    >>> df.createOrReplaceTempView("table1")
    >>> df2 = spark.sql("SELECT field1 AS f1, field2 as f2 from table1")
    >>> df2.collect()
    [Row(f1=1, f2=u'row1'), Row(f1=2, f2=u'row2'), Row(f1=3, f2=u'row3')]
    """
    return DataFrame(self._jsparkSession.sql(sqlQuery), self._wrapped)
Example 26
def convert_sparse_to_dataframe(spark, context, sparse_matrix):
    """ Converts a scipy sparse matrix to a spark dataframe """
    m = sparse_matrix.tocoo()
    data = context.parallelize(numpy.array([m.row, m.col, m.data]).T,
                               numSlices=len(m.row)/1024)
    return spark.createDataFrame(data.map(lambda p: Row(row=int(p[0]),
                                                        col=int(p[1]),
                                                        data=float(p[2]))))
Example 27
def decide_best_params(all_params: List[Row]) -> Row:
    # Input rows match mt.ModelParameters
    best_loss_idx = np.argmin([row['loss'] for row in all_params])
    initial_best_params = all_params[best_loss_idx]
    # TODO: Something better than direct argmin, likely there are several
    # quite similar sets of params that could be chosen between.
    return initial_best_params
Example 28
def simplify_datawriter_paths(j_paths: Any) -> List[Row]:
    rows = []
    for fold_id, j_fold in enumerate(j_paths):
        for split_name, file_path in dict(j_fold).items():
            rows.append(Row(
                vec_format='svmrank',
                split_name=split_name,
                path=file_path,
                fold_id=fold_id))
    return rows

# wikiid doesn't exist at this level, the dataframes already
# represent a single wiki and that is added back in later.
Example 29
def test_dataframe_no_schema(dataset, spark):
    df = dataset.dataframe(spark, decode=decode)
    assert type(df) == DataFrame
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Example 30
def test_dataframe_with_schema(dataset, spark):
    schema = StructType([StructField("foo", IntegerType(), True)])
    df = dataset.dataframe(spark, decode=decode, schema=schema, table_name='bar')
    assert type(df) == DataFrame
    assert df.columns == ['foo']
    assert df.orderBy(["foo"]).collect() == [Row(foo=1), Row(foo=2)]
Source: https://www.programcreek.com/python/example/100657/pyspark.sql.Row