1>>> n_by_state = df.groupby("state")["state"].count()
2>>> n_by_state.head(10)
3state
4AK 16
5AL 206
6AR 117
7AS 2
8AZ 48
9CA 361
10CO 90
11CT 240
12DC 2
13DE 97
14Name: last_name, dtype: int64
15
# Group the DataFrame by the specified columns so that aggregations can be run
# per group. The comment after each call shows its expected result; those
# results imply `df` holds two rows, (name='Alice', age=2) and
# (name='Bob', age=5). `sorted(...)` is used because the order of collected
# rows is not guaranteed.

# No grouping columns: the aggregate is computed over the whole DataFrame.
df.groupBy().avg().collect()
# [Row(avg(age)=3.5)]

# Group by column name and aggregate with a {column: function} mapping.
sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
# [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]

# Group by a Column object instead of a column name.
sorted(df.groupBy(df.name).avg().collect())
# [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]

# Column names and Column objects can be mixed in a single list.
sorted(df.groupBy(['name', df.age]).count().collect())
# [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)]