加载示例数据
# Attach dplyr for the data-manipulation verbs used in this script and
# nycflights13, which supplies the `flights` table.
library(dplyr)
library(nycflights13)
# Preview the first six rows of `flights`.
head(flights, n = 6L)
数据筛选
0.1.1 筛选出 1 月 1 日的数据
# Keep only the rows where both month and day equal 1.
filter(flights, month == 1 & day == 1)
0.1.2 筛选出大于 6 月的数据
# Keep only the rows whose month value is greater than 6.
flights %>%
  filter(month > 6)
0.1.3 筛选出 1 月或 2 月的数据
# Keep only the rows whose month is 1 or 2 (set membership instead of
# chaining equality tests with `|`).
filter(flights, month %in% c(1, 2))
数据排序
0.1.1 按照年、月、日的优先级进行升序重排
# Sort rows in ascending order by year, breaking ties by month and then day.
flights %>%
  arrange(year, month, day)
0.1.2 按照 arr_delay 字段进行降序重排
# Sort rows by arr_delay from the largest value to the smallest.
flights %>%
  arrange(desc(arr_delay))
选择列并剔除重复行
0.1.1 按列名选取列(year、month、day)
# Keep only the year, month and day columns, naming each one explicitly.
flights %>%
  select(year, month, day)
# Select the contiguous range of columns running from year through day
# (the same three columns if they sit next to each other in `flights`).
flights %>%
  select(year:day)
0.1.2 按照列对重复行进行筛选
# Reduce the table to the origin and dest columns, then drop duplicate rows
# so each origin/dest combination appears once.
flights %>%
  select(origin, dest) %>%
  distinct()
数据变形
0.1.1 生成了新的列变量
# Append two derived columns to every row of `flights`:
#   gain  - arr_delay minus dep_delay
#   speed - distance / air_time * 60 (presumably distance per hour,
#           assuming air_time is in minutes; confirm in the data docs)
flights %>%
  mutate(
    gain = arr_delay - dep_delay,
    speed = distance / air_time * 60
  )
0.1.2 引用已生成的变量
# Add `gain`, then reuse it in the same call to derive `gain_per_hour`.
# mutate() replaces base transform() here because mutate() lets a later
# expression refer to a column created earlier in the same call, whereas
# transform() evaluates every expression against the original columns only.
# The previous version also overwrote the real `arr_delay` column with
# `month - day` and divided by `month / 60`, which gave a meaningless result.
mutate(
  flights,
  gain = arr_delay - dep_delay,
  # Assumes air_time is recorded in minutes, so / 60 yields hours; confirm
  # against the nycflights13 documentation.
  gain_per_hour = gain / (air_time / 60)
)
0.1.3 只保留新生成的列
# Compute `gain` and `gain_per_hour` (the second reuses the first) and return
# only those newly computed columns; the original columns are dropped.
flights %>%
  transmute(
    gain = arr_delay - dep_delay,
    gain_per_hour = gain / (air_time / 60)
  )
数据汇总
# Collapse the whole table into a single row holding the mean of dep_delay,
# with missing values excluded from the average.
flights %>%
  summarise(delay = mean(dep_delay, na.rm = TRUE))
数据分组
# Group the rows of `flights` by the tailnum column.
by_tailnum <- flights %>% group_by(tailnum)

# One summary row per tailnum:
#   count - number of rows in the group
#   dist  - mean of distance (missing values ignored)
#   delay - mean of arr_delay (missing values ignored)
delay <- summarise(
  by_tailnum,
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
)

# Keep only groups with more than 20 rows and a mean distance below 2000.
delay <- delay %>% filter(count > 20 & dist < 2000)
其他
# Same grouped summary as a single pipeline: group by tailnum, compute the
# per-group row count, mean distance and mean arr_delay (missing values
# ignored), then keep groups with more than 20 rows and mean distance < 2000.
by_tailnum <- group_by(flights, tailnum)
delay <- by_tailnum %>%
  summarise(
    count = n(),
    dist = mean(distance, na.rm = TRUE),
    delay = mean(arr_delay, na.rm = TRUE)
  ) %>%
  filter(count > 20, dist < 2000)