-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest-spark-py3.py
46 lines (35 loc) · 1.23 KB
/
test-spark-py3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
# code by Afik Bar, from email 2021-06-04
# Demo script: generate a random-vector DataFrame with PySpark, write it to
# Parquet, then read it back and report the record count.
import sys
import random
# Guard the Python version BEFORE importing pyspark, so users on Python 2 get
# this clear message rather than a pyspark import/syntax failure.
major = sys.version_info[0]
if major != 3:
    raise Exception("Please use Python 3")
# pyspark imports intentionally come after the version check (see above).
from pyspark.sql import SparkSession
from pyspark.mllib.random import RandomRDDs
def init_spark(app_name):
    """Create (or reuse) a SparkSession named *app_name*.

    Returns a ``(SparkSession, SparkContext)`` pair.
    """
    session = SparkSession.builder.appName(app_name).getOrCreate()
    return session, session.sparkContext
def main():
    """Write `length` random 3-vectors to Parquet, then read them back.

    Uses the module-level ``spark`` / ``sc`` globals set by the
    ``__main__`` guard. An optional command-line argument overrides the
    default row count of 1000.
    """
    # sys.argv entries are strings — convert to int before handing the
    # value to uniformVectorRDD as a row count (original code passed the
    # raw string, which breaks the RDD size and the final comparison).
    if len(sys.argv) == 2:
        length = int(sys.argv[1])
    else:
        length = 10**3

    # `length` uniform random vectors of dimension 3, rounded to 3 decimals.
    data0_1 = RandomRDDs.uniformVectorRDD(sc, length, 3) \
        .map(lambda vec: vec.round(3).tolist()) \
        .toDF()

    # Fixed suffix keeps the output path deterministic between runs;
    # swap in random.randrange(200) for a fresh path per run.
    name = "random_data{}.parquet".format(333)
    print("using name=" + name)
    data0_1.write.parquet(name)

    # Round-trip sanity check: read the file back and report the count.
    read_df = spark.read.parquet(name)
    print("Read {} records. Should be {} records.".format(read_df.count(), length))
if __name__ == '__main__':
    # Bind the module-level `spark` and `sc` globals that main() relies on.
    spark, sc = init_spark('demo222')
    print("before calling main")
    main()
    print("after main returned")