このような結果がほしいとします:
Species | Sepal.Length | Sepal.Width |
---|---|---|
versicolor | 5.936 | 2.770 |
virginica | 6.588 | 2.974 |
setosa | 5.006 | 3.428 |
iris %>%
dplyr::select(starts_with("sepal"),Species) %>%
dplyr::group_by(Species) %>%
dplyr::summarize_each(funs(mean)) %>%
dplyr::arrange(Sepal.Width)
## # A tibble: 3 × 3
## Species Sepal.Length Sepal.Width
## <fctr> <dbl> <dbl>
## 1 versicolor 5.936 2.770
## 2 virginica 6.588 2.974
## 3 setosa 5.006 3.428
iris %>%
dplyr::select(starts_with("sepal"),Species) %>%
dplyr::group_by(Species) %>%
dplyr::summarize_each(funs(mean)) %>%
dplyr::arrange(Sepal.Width)
irisデータを…
"sepal"で始まる変数と"Species"を取り出して…
"Species"の値でグループ化して…
各列に対して"平均"で要約して…
"Sepal.Width"の昇順でソート
このような結果がほしいとします:
var_name | value |
---|---|
Sepal.Length | 5.1 |
Sepal.Length | 4.9 |
Sepal.Length | 4.7 |
Sepal.Length | 4.6 |
Sepal.Length | 5.0 |
Sepal.Length | 5.4 |
iris %>%
dplyr::select(-Species) %>%
tidyr::gather(key = var_name, value = value) %>%
head
## var_name value
## 1 Sepal.Length 5.1
## 2 Sepal.Length 4.9
## 3 Sepal.Length 4.7
## 4 Sepal.Length 4.6
## 5 Sepal.Length 5.0
## 6 Sepal.Length 5.4
iris %>%
dplyr::select(-Species) %>%
tidyr::gather(key = var_name, value = value) %>%
head
irisデータを…
"Species"以外の変数を取り出して…
4つの変数を縦型のデータに整形して…
上6つのデータを表示
# 使わない書き方
df <- subset(iris, Species == "setosa")
df2 <- subset(df, select = c(Sepal.Length, Sepal.Width))
head(df2, 2)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
# 使った書き方
(df <- subset(iris, Species == "setosa") %>%
subset(select = c(Sepal.Length, Sepal.Width)) %>%
head(2))
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
必要なパッケージをインストール&読み込み
install.packages("dplyr")
install.packages("tidyr")
library(dplyr)
library(tidyr)
sessionInfo()
## R version 3.3.1 (2016-06-21)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Ubuntu 14.04.4 LTS
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] ggplot2_2.1.0.9000 knitr_1.14 tidyr_0.6.0
## [4] dplyr_0.5.0 rmarkdown_1.0.9016 stringr_1.1.0
##
## loaded via a namespace (and not attached):
## [1] Rcpp_0.12.7 magrittr_1.5 munsell_0.4.3 colorspace_1.2-6
## [5] R6_2.2.0 highr_0.6 plyr_1.8.4 tools_3.3.1
## [9] revealjs_0.7 grid_3.3.1 gtable_0.2.0 DBI_0.5-1
## [13] htmltools_0.3.5 yaml_2.1.13 lazyeval_0.2.0 assertthat_0.1
## [17] digest_0.6.10 tibble_1.2 formatR_1.4 htmlwidgets_0.7
## [21] rsconnect_0.4.4 evaluate_0.10 labeling_0.3 stringi_1.1.2
## [25] scales_0.4.0 jsonlite_1.1
df <- dplyr::select(iris, c(Sepal.Width,Species))
head(df, 3)
## Sepal.Width Species
## 1 3.5 setosa
## 2 3.0 setosa
## 3 3.2 setosa
df <- dplyr::select(iris, starts_with("sepal"))
head(df, 3)
## Sepal.Length Sepal.Width
## 1 5.1 3.5
## 2 4.9 3.0
## 3 4.7 3.2
`ignore.case = FALSE` にすると、大文字と小文字を区別してくれる
df <- dplyr::select(iris, ends_with("width"))
head(df, 3)
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
df <- dplyr::select(iris, contains("pe"))
head(df, 3)
## Petal.Length Petal.Width Species
## 1 1.4 0.2 setosa
## 2 1.4 0.2 setosa
## 3 1.3 0.2 setosa
df <- dplyr::select(iris, matches(".t."))
head(df, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
# Build a 3 x 10 demo data frame whose column names end in a number.
df <- as.data.frame(matrix(1:30, nrow = 3, ncol = 10))
colnames(df) <- c(paste0("beer", 1:5), paste0("sake0", 1:5))
# names() lists the columns in their actual order; ls() would coerce the data
# frame to an environment and return the names sorted alphabetically.
names(df)
## [1] "beer1" "beer2" "beer3" "beer4" "beer5" "sake01" "sake02"
## [8] "sake03" "sake04" "sake05"
dplyr::select(df, num_range("beer", 1:3, 1))
## beer1 beer2 beer3
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9
dplyr::select(df, num_range("sake", 2:4, 2))
## sake02 sake03 sake04
## 1 19 22 25
## 2 20 23 26
## 3 21 24 27
vname <- c("Petal.Length", "Sepal.Width")
df <- dplyr::select(iris, one_of(vname))
head(df, 3)
## Petal.Length Sepal.Width
## 1 1.4 3.5
## 2 1.4 3.0
## 3 1.3 3.2
df <- as.data.frame(matrix(1:15, nrow = 3, ncol = 5))
colnames(df) <- c("touyama", "hanazawa", "komatsu", "asumi", "sakura")
dplyr::select(df, everything())
## touyama hanazawa komatsu asumi sakura
## 1 1 4 7 10 13
## 2 2 5 8 11 14
## 3 3 6 9 12 15
dplyr::select(df, hanazawa, touyama, everything())
## hanazawa touyama komatsu asumi sakura
## 1 4 1 7 10 13
## 2 5 2 8 11 14
## 3 6 3 9 12 15
こういう使い方も。
head(dplyr::select(iris, kosaki=starts_with("Petal"), everything()))
## kosaki1 kosaki2 Sepal.Length Sepal.Width Species
## 1 1.4 0.2 5.1 3.5 setosa
## 2 1.4 0.2 4.9 3.0 setosa
## 3 1.3 0.2 4.7 3.2 setosa
## 4 1.5 0.2 4.6 3.1 setosa
## 5 1.4 0.2 5.0 3.6 setosa
## 6 1.7 0.4 5.4 3.9 setosa
df <- dplyr::mutate(iris, beer=Sepal.Width*2)
head(df, 2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species beer
## 1 5.1 3.5 1.4 0.2 setosa 7
## 2 4.9 3.0 1.4 0.2 setosa 6
df <- dplyr::mutate(df, beer=Sepal.Width*3)
head(df, 2)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species beer
## 1 5.1 3.5 1.4 0.2 setosa 10.5
## 2 4.9 3.0 1.4 0.2 setosa 9.0
# Multiply every column except Species by 2.
df <- dplyr::mutate_each(iris, funs(. * 2), -Species)
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(head(df,2))
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
10.2 | 7 | 2.8 | 0.4 | setosa |
9.8 | 6 | 2.8 | 0.4 | setosa |
df <- dplyr::filter(iris, Species=='virginica')
head(df, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 6.3 3.3 6.0 2.5 virginica
## 2 5.8 2.7 5.1 1.9 virginica
## 3 7.1 3.0 5.9 2.1 virginica
df <- dplyr::arrange(iris, Sepal.Length)
head(df, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 4.3 3.0 1.1 0.1 setosa
## 2 4.4 2.9 1.4 0.2 setosa
## 3 4.4 3.0 1.3 0.2 setosa
df <- dplyr::arrange(df, desc(Sepal.Length))
head(df, 3)
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 7.9 3.8 6.4 2.0 virginica
## 2 7.7 3.8 6.7 2.2 virginica
## 3 7.7 2.6 6.9 2.3 virginica
df <- dplyr::summarize(iris, varmean=mean(Sepal.Length))
df
## varmean
## 1 5.843333
df <- dplyr::summarize_each(iris, funs(min, mean, max, sd), ends_with("Length"))
df
## Sepal.Length_min Petal.Length_min Sepal.Length_mean Petal.Length_mean
## 1 4.3 1 5.843333 3.758
## Sepal.Length_max Petal.Length_max Sepal.Length_sd Petal.Length_sd
## 1 7.9 6.9 0.8280661 1.765298
df <- dplyr::group_by(iris, Species) %>%
dplyr::summarize(cmean=mean(Sepal.Length))
df
## # A tibble: 3 × 2
## Species cmean
## <fctr> <dbl>
## 1 setosa 5.006
## 2 versicolor 5.936
## 3 virginica 6.588
a <- data.frame(x1=c("A","B","C"),x2=1:3)
b <- data.frame(x1=c("A","B","D"),x3=c(TRUE, FALSE, TRUE))
y <- data.frame(x1=c("A","B","C"),x2=1:3)
z <- data.frame(x1=c("B","C","D"),x2=2:4)
# Full join of a and b on the key column x1.
df <- dplyr::full_join(a,b,by="x1")
## Warning in full_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(df)
x1 | x2 | x3 |
---|---|---|
A | 1 | TRUE |
B | 2 | FALSE |
C | 3 | NA |
D | NA | TRUE |
`NA` がはいります
df <- dplyr::inner_join(a,b,by="x1")
## Warning in inner_join_impl(x, y, by$x, by$y, suffix$x, suffix$y): joining
## factors with different levels, coercing to character vector
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(df)
x1 | x2 | x3 |
---|---|---|
A | 1 | TRUE |
B | 2 | FALSE |
# Place the columns of y and z side by side.
df <- dplyr::bind_cols(y, z)
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(df)
x1 | x2 | x1 | x2 |
---|---|---|---|
A | 1 | B | 2 |
B | 2 | C | 3 |
C | 3 | D | 4 |
# Stack the rows of y and z on top of each other.
df <- dplyr::bind_rows(y, z)
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(df)
x1 | x2 |
---|---|
A | 1 |
B | 2 |
C | 3 |
B | 2 |
C | 3 |
D | 4 |
# Stack the rows of y and z, recording the source table in column df_id.
df <- dplyr::bind_rows(y, z, .id = "df_id")
## Warning in bind_rows_(x, .id): Unequal factor levels: coercing to character
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(df)
df_id | x1 | x2 |
---|---|---|
1 | A | 1 |
1 | B | 2 |
1 | C | 3 |
2 | B | 2 |
2 | C | 3 |
2 | D | 4 |
`.id=` を指定すると、テーブルidを変数として作成してくれます
df <- tidyr::gather(data=iris, key = keykey, value = valuevalue, -Species)
# Namespace-qualified so the call works without attaching knitr.
knitr::kable(head(df, 4))
Species | keykey | valuevalue |
---|---|---|
setosa | Sepal.Length | 5.1 |
setosa | Sepal.Length | 4.9 |
setosa | Sepal.Length | 4.7 |
setosa | Sepal.Length | 4.6 |
# irisにID列を追加して、gatherでまとめている
df <- dplyr::mutate(iris, id=rownames(iris)) %>%
tidyr::gather(key = keykey, value = valuevalue, contains("l."))
knitr::kable(head(df,2))
Species | id | keykey | valuevalue |
---|---|---|---|
setosa | 1 | Sepal.Length | 5.1 |
setosa | 2 | Sepal.Length | 4.9 |
# ひっつけたけどspreadでバラします
df_2 <- tidyr::spread(df, key = keykey, value = valuevalue)
knitr::kable(head(df_2,2))
Species | id | Petal.Length | Petal.Width | Sepal.Length | Sepal.Width |
---|---|---|---|---|---|
setosa | 1 | 1.4 | 0.2 | 5.1 | 3.5 |
setosa | 10 | 1.5 | 0.1 | 4.9 | 3.1 |
df <- tidyr::unite(data = iris, col = colll, starts_with("Sepal"), sep = "-")
knitr::kable(head(df))
colll | Petal.Length | Petal.Width | Species |
---|---|---|---|
5.1-3.5 | 1.4 | 0.2 | setosa |
4.9-3 | 1.4 | 0.2 | setosa |
4.7-3.2 | 1.3 | 0.2 | setosa |
4.6-3.1 | 1.5 | 0.2 | setosa |
5-3.6 | 1.4 | 0.2 | setosa |
5.4-3.9 | 1.7 | 0.4 | setosa |
# 一旦ひっつけます(uniteのコードと同一)
df <- tidyr::unite(data = iris, col = colll, starts_with("Sepal"), sep = "-")
# separateを実行
df2 <- tidyr::separate(data = df, col = colll, into = c("Sepal.Length","Sepal.Width"), sep = "-")
knitr::kable(head(df2))
Sepal.Length | Sepal.Width | Petal.Length | Petal.Width | Species |
---|---|---|---|---|
5.1 | 3.5 | 1.4 | 0.2 | setosa |
4.9 | 3 | 1.4 | 0.2 | setosa |
4.7 | 3.2 | 1.3 | 0.2 | setosa |
4.6 | 3.1 | 1.5 | 0.2 | setosa |
5 | 3.6 | 1.4 | 0.2 | setosa |
5.4 | 3.9 | 1.7 | 0.4 | setosa |
https://www.rstudio.com/resources/cheatsheets/
https://www.rstudio.com/wp-content/uploads/2015/09/data-wrangling-japanese.pdf