Packages
library(magrittr)
library(readr)
library(tidyr)
library(dplyr)Defining tidy data (long-format)
- Each variable forms a column.
- Each observation forms a row.
- Each type of observational unit forms a table.
Tools for reshaping dataframe
| Packages | to long | to wide |
|---|---|---|
| tidyr | gather | spread |
| reshape2 | melt | dcast |
| pandas | melt | unstsack / pivot_table / pivot |
| spreadsheets | unpivot | pivot |
| databases | fold | unfold |
Example dataset
Wide data
| name | treatmenta | treatmentb |
|---|---|---|
| John Smith | NA | 18 |
| Jane Doe | 4 | 1 |
| Mary Johnson | 6 | 7 |
| treatment | John Smith | Jane Doe | Mary Johnson |
|---|---|---|---|
| a | NA | 4 | 6 |
| b | 18 | 1 | 7 |
Long data
| name | treatment | n |
|---|---|---|
| Jane Doe | a | 4 |
| Jane Doe | b | 1 |
| John Smith | a | NA |
| John Smith | b | 18 |
| Mary Johnson | a | 6 |
| Mary Johnson | b | 7 |
Tidying: Tidyr variables
起手式
- Specifying variables and values. (it depends)
gather(): Put all values in the cells.spread(): Put variables back to columns.
gather: to long
data %>% gather(key, value, ...cols_to_gather)
df_wide <- read_csv("http://bit.ly/country-year-wide")
df_wide
#> # A tibble: 3 x 3
#> country `1999` `2000`
#> <chr> <int> <int>
#> 1 Afghanistan 745 2666
#> 2 Brazil 37737 80488
#> 3 China 212258 213766
df_wide %>% gather(key = "year", value = "cases", -country)
#> # A tibble: 6 x 3
#> country year cases
#> <chr> <chr> <int>
#> 1 Afghanistan 1999 745
#> 2 Brazil 1999 37737
#> 3 China 1999 212258
#> 4 Afghanistan 2000 2666
#> 5 Brazil 2000 80488
#> 6 China 2000 213766spread: to wide
data %>% spread(key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)
df_long <- read_csv("http://bit.ly/cases-long")
df_long
#> # A tibble: 12 x 4
#> country year type count
#> <chr> <int> <chr> <int>
#> 1 Afghanistan 1999 cases 745
#> 2 Afghanistan 1999 population 19987071
#> 3 Afghanistan 2000 cases 2666
#> 4 Afghanistan 2000 population 20595360
#> 5 Brazil 1999 cases 37737
#> 6 Brazil 1999 population 172006362
#> 7 Brazil 2000 cases 80488
#> 8 Brazil 2000 population 174504898
#> 9 China 1999 cases 212258
#> 10 China 1999 population 1272915272
#> 11 China 2000 cases 213766
#> 12 China 2000 population 1280428583
df_long %>%
spread(key = type, value = count)
#> # A tibble: 6 x 4
#> country year cases population
#> <chr> <int> <int> <int>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583seperate: 分割一個 cell 多個值
data %>% seperate(col, into, sep)
容易發生在源頭是 Excel 的資料
df_to_sep <- read_csv("http://bit.ly/sep-raw")
df_to_sep
#> # A tibble: 6 x 3
#> country year rate
#> <chr> <int> <chr>
#> 1 Afghanistan 1999 745/19987071
#> 2 Afghanistan 2000 2666/20595360
#> 3 Brazil 1999 37737/172006362
#> 4 Brazil 2000 80488/174504898
#> 5 China 1999 212258/1272915272
#> 6 China 2000 213766/1280428583
df_to_sep %>%
separate(rate, into = c("cases", "pop"), sep = "/")
#> # A tibble: 6 x 4
#> country year cases pop
#> <chr> <int> <chr> <chr>
#> 1 Afghanistan 1999 745 19987071
#> 2 Afghanistan 2000 2666 20595360
#> 3 Brazil 1999 37737 172006362
#> 4 Brazil 2000 80488 174504898
#> 5 China 1999 212258 1272915272
#> 6 China 2000 213766 1280428583data %>% seperate_rows(sep) == seperate %>% gather
df_to_sep %>%
separate_rows(rate, sep = "/")
#> # A tibble: 12 x 3
#> country year rate
#> <chr> <int> <chr>
#> 1 Afghanistan 1999 745
#> 2 Afghanistan 1999 19987071
#> 3 Afghanistan 2000 2666
#> 4 Afghanistan 2000 20595360
#> 5 Brazil 1999 37737
#> 6 Brazil 1999 172006362
#> 7 Brazil 2000 80488
#> 8 Brazil 2000 174504898
#> 9 China 1999 212258
#> 10 China 1999 1272915272
#> 11 China 2000 213766
#> 12 China 2000 1280428583狀況1: 欄位名稱是值 (dataset: pew)
容易發生在源頭是 Excel 的資料
pew <- read_csv("http://bit.ly/pew-raw")| religion | <$10k | $10-20k | $20-30k | $30-40k | $40-50k | $50-75k | $75-100k | $100-150k | >150k | Don’t know/refused |
|---|---|---|---|---|---|---|---|---|---|---|
| Agnostic | 27 | 34 | 60 | 81 | 76 | 137 | 122 | 109 | 84 | 96 |
| Atheist | 12 | 27 | 37 | 52 | 35 | 70 | 73 | 59 | 74 | 76 |
| Buddhist | 27 | 21 | 30 | 34 | 33 | 58 | 62 | 39 | 53 | 54 |
| Catholic | 418 | 617 | 732 | 670 | 638 | 1116 | 949 | 792 | 633 | 1489 |
| Don’t know/refused | 15 | 14 | 15 | 11 | 10 | 35 | 21 | 17 | 18 | 116 |
| Evangelical Prot | 575 | 869 | 1064 | 982 | 881 | 1486 | 949 | 723 | 414 | 1529 |
3 variables:
- religion
- income
- frequency
?gather
pew %>%
gather(key = income, value = frequency, -religion)
#> # A tibble: 180 x 3
#> religion income frequency
#> <chr> <chr> <int>
#> 1 Agnostic <$10k 27
#> 2 Atheist <$10k 12
#> 3 Buddhist <$10k 27
#> 4 Catholic <$10k 418
#> 5 Don’t know/refused <$10k 15
#> 6 Evangelical Prot <$10k 575
#> 7 Hindu <$10k 1
#> 8 Historically Black Prot <$10k 228
#> 9 Jehovah's Witness <$10k 20
#> 10 Jewish <$10k 19
#> # ... with 170 more rows狀況2: 複合型欄位名 (dataset: tb)
tb <- read_csv("http://bit.ly/tb-raw")
tb
#> # A tibble: 5,769 x 22
#> iso2 year m04 m514 m014 m1524 m2534 m3544 m4554 m5564 m65 mu
#> <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#> 1 AD 1989 NA NA NA NA NA NA NA NA NA NA
#> 2 AD 1990 NA NA NA NA NA NA NA NA NA NA
#> 3 AD 1991 NA NA NA NA NA NA NA NA NA NA
#> 4 AD 1992 NA NA NA NA NA NA NA NA NA NA
#> 5 AD 1993 NA NA NA NA NA NA NA NA NA NA
#> 6 AD 1994 NA NA NA NA NA NA NA NA NA NA
#> 7 AD 1996 NA NA 0 0 0 4 1 0 0 NA
#> 8 AD 1997 NA NA 0 0 1 2 2 1 6 NA
#> 9 AD 1998 NA NA 0 0 0 1 0 0 0 NA
#> 10 AD 1999 NA NA 0 0 0 1 1 0 0 NA
#> # ... with 5,759 more rows, and 10 more variables: f04 <int>, f514 <int>,
#> # f014 <int>, f1524 <int>, f2534 <int>, f3544 <int>, f4554 <int>,
#> # f5564 <int>, f65 <int>, fu <int>
tb %>%
gather(key = "sex_and_age", value = "cases", m04:fu) %>%
drop_na(cases) %>%
separate(sex_and_age, into = c("sex", "age"), sep = 1)
#> # A tibble: 35,750 x 5
#> iso2 year sex age cases
#> <chr> <int> <chr> <chr> <int>
#> 1 AD 2005 m 04 0
#> 2 AD 2006 m 04 0
#> 3 AD 2008 m 04 0
#> 4 AE 2006 m 04 0
#> 5 AE 2007 m 04 0
#> 6 AE 2008 m 04 0
#> 7 AG 2007 m 04 0
#> 8 AL 2005 m 04 0
#> 9 AL 2006 m 04 1
#> 10 AL 2007 m 04 0
#> # ... with 35,740 more rows狀況3: 欄位散佈在欄和列 (dataset: weather)
weather <- read_csv("http://bit.ly/weather-raw")
weather
#> # A tibble: 22 x 35
#> id year month element d1 d2 d3 d4 d5 d6 d7
#> <chr> <int> <int> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 MX17004 2010 1 tmax NA NA NA NA NA NA NA
#> 2 MX17004 2010 1 tmin NA NA NA NA NA NA NA
#> 3 MX17004 2010 2 tmax NA 27.3 24.1 NA NA NA NA
#> 4 MX17004 2010 2 tmin NA 14.4 14.4 NA NA NA NA
#> 5 MX17004 2010 3 tmax NA NA NA NA 32.1 NA NA
#> 6 MX17004 2010 3 tmin NA NA NA NA 14.2 NA NA
#> 7 MX17004 2010 4 tmax NA NA NA NA NA NA NA
#> 8 MX17004 2010 4 tmin NA NA NA NA NA NA NA
#> 9 MX17004 2010 5 tmax NA NA NA NA NA NA NA
#> 10 MX17004 2010 5 tmin NA NA NA NA NA NA NA
#> # ... with 12 more rows, and 24 more variables: d8 <dbl>, d9 <chr>,
#> # d10 <dbl>, d11 <dbl>, d12 <chr>, d13 <dbl>, d14 <dbl>, d15 <dbl>,
#> # d16 <dbl>, d17 <dbl>, d18 <chr>, d19 <chr>, d20 <chr>, d21 <chr>,
#> # d22 <chr>, d23 <dbl>, d24 <chr>, d25 <dbl>, d26 <dbl>, d27 <dbl>,
#> # d28 <dbl>, d29 <dbl>, d30 <dbl>, d31 <dbl>
weather %>%
gather("day", "value", d1:d31, na.rm = TRUE) %>%
mutate(day = readr::parse_number(day)) %>%
select(id, year, month, day, element, value) %>%
arrange(id, year, month, day) %>%
spread(key = element, value = value)
#> # A tibble: 33 x 6
#> id year month day tmax tmin
#> <chr> <int> <int> <dbl> <chr> <chr>
#> 1 MX17004 2010 1 30 27.8 14.5
#> 2 MX17004 2010 2 2 27.3 14.4
#> 3 MX17004 2010 2 3 24.1 14.4
#> 4 MX17004 2010 2 11 29.7 13.4
#> 5 MX17004 2010 2 23 29.9 10.7
#> 6 MX17004 2010 3 5 32.1 14.2
#> 7 MX17004 2010 3 10 34.5 16.8
#> 8 MX17004 2010 3 16 31.1 17.6
#> 9 MX17004 2010 4 27 36.3 16.7
#> 10 MX17004 2010 5 27 33.2 18.2
#> # ... with 23 more rows