1>>> n_by_state = df.groupby("state")["state"].count()
2>>> n_by_state.head(10)
3state
4AK 16
5AL 206
6AR 117
7AS 2
8AZ 48
9CA 361
10CO 90
11CT 240
12DC 2
13DE 97
14Name: state, dtype: int64
15
1>>> df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
2... 'Parrot', 'Parrot'],
3... 'Max Speed': [380., 370., 24., 26.]})
4>>> df
5 Animal Max Speed
60 Falcon 380.0
71 Falcon 370.0
82 Parrot 24.0
93 Parrot 26.0
10>>> df.groupby(['Animal']).mean()
11 Max Speed
12Animal
13Falcon 375.0
14Parrot 25.0
15
1# Groups the DataFrame using the specified columns so that aggregations can be run on them
2
3df.groupBy().avg().collect()
4# [Row(avg(age)=3.5)]
5sorted(df.groupBy('name').agg({'age': 'mean'}).collect())
6# [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
7sorted(df.groupBy(df.name).avg().collect())
8# [Row(name='Alice', avg(age)=2.0), Row(name='Bob', avg(age)=5.0)]
9sorted(df.groupBy(['name', df.age]).count().collect())
10# [Row(name='Alice', age=2, count=1), Row(name='Bob', age=5, count=1)]
1In [1]: df = pd.DataFrame( {'a':['A','A','B','B','B','C'], 'b':[1,2,5,5,4,6]})
2 df
3
4Out[1]:
5 a b
60 A 1
71 A 2
82 B 5
93 B 5
104 B 4
115 C 6
12
13In [2]: df.groupby('a')['b'].apply(list)
14Out[2]:
15a
16A [1, 2]
17B [5, 5, 4]
18C [6]
19Name: b, dtype: object
20
21In [3]: df1 = df.groupby('a')['b'].apply(list).reset_index(name='new')
22 df1
23Out[3]:
24 a new
250 A [1, 2]
261 B [5, 5, 4]
272 C [6]
28