-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfind_primes.py
56 lines (42 loc) · 1.26 KB
/
find_primes.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
import argparse
import math

from pyspark.sql import SparkSession
def parse_args():
    """Parse the command-line options for the prime finder.

    Two string options are declared, both mandatory:
    ``--input_dir`` and ``--output_dir``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``, exposing
        ``input_dir`` and ``output_dir`` attributes.
    """
    cli = argparse.ArgumentParser(
        description="Find prime numbers in a dataset using PySpark"
    )
    # (flag, help text) pairs; every option is a required string.
    required_options = (
        ("--input_dir", "Input data directory"),
        ("--output_dir", "Output directory for prime numbers"),
    )
    for flag, help_text in required_options:
        cli.add_argument(flag, type=str, required=True, help=help_text)
    namespace = cli.parse_args()
    return namespace
def is_prime(num: int) -> bool:
    """Return True if *num* is a prime number, False otherwise.

    Primality is decided by trial division: after handling values <= 1,
    the value 2, and even numbers, every odd candidate divisor from 3 up
    to the integer square root of *num* is tried.

    Args:
        num: The integer to test.

    Returns:
        True when *num* is prime, False otherwise (including for any
        value less than or equal to 1).
    """
    if num <= 1:
        return False
    if num == 2:
        return True
    if num % 2 == 0:
        return False
    # math.isqrt works purely on integers, so the bound is exact for
    # arbitrarily large values. The float-based ``int(num ** 0.5)`` raises
    # OverflowError once num no longer fits in a float.
    limit = math.isqrt(num)
    for d in range(3, limit + 1, 2):
        if num % d == 0:
            return False
    return True
if __name__ == "__main__":
    # Script entry point: read integers (one per line) from the input path,
    # keep the distinct primes, and write them sorted as CSV.
    args = parse_args()
    spark = (
        SparkSession.builder.master("local[*]")
        .appName("PrimeNumberFinder")
        .getOrCreate()
    )
    try:
        lines = spark.sparkContext.textFile(args.input_dir)
        # Skip blank / whitespace-only lines so they do not crash int();
        # any other non-numeric line still raises and fails the job loudly.
        numbers = lines.filter(lambda line: line.strip()).map(int)
        primes_rdd = numbers.filter(is_prime).distinct()
        # Give the schema explicitly instead of letting Spark infer it from
        # the first row: inference fails when the RDD is empty, i.e. when
        # the input contains no primes at all.
        primes_df = spark.createDataFrame(
            primes_rdd.map(lambda x: (x,)), schema="prime: long"
        )
        sorted_primes_df = primes_df.orderBy("prime")
        sorted_primes_df.write.mode("overwrite").csv(args.output_dir)
        print(f"Prime numbers written to {args.output_dir}")
    finally:
        # Always release the Spark session, even if the job fails midway.
        spark.stop()