# Named accumulators are not available in Python yet.
# You can use broadcast variables to observe your progress (somehow).

# Get a SparkContext `sc` and read your data into an RDD (or DataFrame).
rows = sc.textFile("/path/to/data/sample_data.txt")

# Broadcast an instance of the Processor class defined below.
# NOTE(review): Processor must already be defined when this line runs; in
# this file the class body appears *after* this use — move the class
# definition above the script (or import it) before executing.
processor = spark.sparkContext.broadcast(Processor())

# Apply this lambda to the RDD; `processor.value` unwraps the broadcast
# value on each executor, so process() runs worker-side, not on the driver.
mapped = rows.map(lambda x: processor.value.process(x))

# ... any other transformations/actions you need go here ...
sc.stop()
class Processor:
    """Broadcast-friendly per-record processor.

    An instance of this class is broadcast to the Spark executors; each
    record is passed through process(), whose side effects (stdout, a
    local file, an HTTP call) make progress observable in the absence of
    named Python accumulators.
    """

    def process(self, content):
        """Emit a progress side effect for *content*, then return the
        result of further processing.

        NOTE(review): `requests` must be imported at module level, and
        `other_processing` is not defined on this class in the visible
        source — confirm both exist before running.
        """
        # Either print some output to the screen (this lands in the
        # executor logs, not the driver console).
        print(content)

        # ...or append to a file (local to whichever executor runs the
        # task — each worker writes its own results.txt).
        with open('results.txt', 'a') as f:
            f.write(content + "\n")

        # ...or call an API to store and count the requests. One HTTP
        # round-trip per record is expensive; consider batching per
        # partition instead.
        requests.get('https://some.external.api/' + content)

        return self.other_processing(content)