Yotta 資料視覺化實戰

Tidy data with R

木刻思 - YJ

Packages

library(magrittr)
library(readr)
library(tidyr)
library(dplyr)

Defining tidy data (long-format)

  1. Each variable forms a column.
  2. Each observation forms a row.
  3. Each type of observational unit forms a table.

Tools for reshaping dataframe

Packages to long to wide
tidyr gather spread
reshape2 melt dcast
pandas melt unstsack / pivot_table / pivot
spreadsheets unpivot pivot
databases fold unfold

Example dataset

Wide data

name treatmenta treatmentb
John Smith NA 18
Jane Doe 4 1
Mary Johnson 6 7
treatment John Smith Jane Doe Mary Johnson
a NA 4 6
b 18 1 7

Long data

name treatment n
Jane Doe a 4
Jane Doe b 1
John Smith a NA
John Smith b 18
Mary Johnson a 6
Mary Johnson b 7

Tidying: Tidyr variables

起手式

  1. Specifying variables and values. (it depends)
  2. gather(): Put all values in the cells.
  3. spread(): Put variables back to columns.

gather: to long

data %>% gather(key, value, ...cols_to_gather)

df_wide <- read_csv("http://bit.ly/country-year-wide")
df_wide
#> # A tibble: 3 x 3
#>   country     `1999` `2000`
#>   <chr>        <int>  <int>
#> 1 Afghanistan    745   2666
#> 2 Brazil       37737  80488
#> 3 China       212258 213766

df_wide %>% gather(key = "year", value = "cases", -country)
#> # A tibble: 6 x 3
#>   country     year   cases
#>   <chr>       <chr>  <int>
#> 1 Afghanistan 1999     745
#> 2 Brazil      1999   37737
#> 3 China       1999  212258
#> 4 Afghanistan 2000    2666
#> 5 Brazil      2000   80488
#> 6 China       2000  213766

spread: to wide

data %>% spread(key, value, fill = NA, convert = FALSE, drop = TRUE, sep = NULL)

df_long <- read_csv("http://bit.ly/cases-long")
df_long
#> # A tibble: 12 x 4
#>    country      year type            count
#>    <chr>       <int> <chr>           <int>
#>  1 Afghanistan  1999 cases             745
#>  2 Afghanistan  1999 population   19987071
#>  3 Afghanistan  2000 cases            2666
#>  4 Afghanistan  2000 population   20595360
#>  5 Brazil       1999 cases           37737
#>  6 Brazil       1999 population  172006362
#>  7 Brazil       2000 cases           80488
#>  8 Brazil       2000 population  174504898
#>  9 China        1999 cases          212258
#> 10 China        1999 population 1272915272
#> 11 China        2000 cases          213766
#> 12 China        2000 population 1280428583

df_long %>% 
  spread(key = type, value = count)
#> # A tibble: 6 x 4
#>   country      year  cases population
#>   <chr>       <int>  <int>      <int>
#> 1 Afghanistan  1999    745   19987071
#> 2 Afghanistan  2000   2666   20595360
#> 3 Brazil       1999  37737  172006362
#> 4 Brazil       2000  80488  174504898
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

seperate: 分割一個 cell 多個值

data %>% seperate(col, into, sep)

容易發生在源頭是 Excel 的資料

df_to_sep <- read_csv("http://bit.ly/sep-raw")
df_to_sep
#> # A tibble: 6 x 3
#>   country      year rate             
#>   <chr>       <int> <chr>            
#> 1 Afghanistan  1999 745/19987071     
#> 2 Afghanistan  2000 2666/20595360    
#> 3 Brazil       1999 37737/172006362  
#> 4 Brazil       2000 80488/174504898  
#> 5 China        1999 212258/1272915272
#> 6 China        2000 213766/1280428583

df_to_sep %>% 
  separate(rate, into = c("cases", "pop"), sep = "/")
#> # A tibble: 6 x 4
#>   country      year cases  pop       
#>   <chr>       <int> <chr>  <chr>     
#> 1 Afghanistan  1999 745    19987071  
#> 2 Afghanistan  2000 2666   20595360  
#> 3 Brazil       1999 37737  172006362 
#> 4 Brazil       2000 80488  174504898 
#> 5 China        1999 212258 1272915272
#> 6 China        2000 213766 1280428583

data %>% seperate_rows(sep) == seperate %>% gather

df_to_sep %>% 
  separate_rows(rate, sep = "/")
#> # A tibble: 12 x 3
#>    country      year rate      
#>    <chr>       <int> <chr>     
#>  1 Afghanistan  1999 745       
#>  2 Afghanistan  1999 19987071  
#>  3 Afghanistan  2000 2666      
#>  4 Afghanistan  2000 20595360  
#>  5 Brazil       1999 37737     
#>  6 Brazil       1999 172006362 
#>  7 Brazil       2000 80488     
#>  8 Brazil       2000 174504898 
#>  9 China        1999 212258    
#> 10 China        1999 1272915272
#> 11 China        2000 213766    
#> 12 China        2000 1280428583

狀況1: 欄位名稱是值 (dataset: pew)

容易發生在源頭是 Excel 的資料

pew <- read_csv("http://bit.ly/pew-raw")
religion <$10k $10-20k $20-30k $30-40k $40-50k $50-75k $75-100k $100-150k >150k Don’t know/refused
Agnostic 27 34 60 81 76 137 122 109 84 96
Atheist 12 27 37 52 35 70 73 59 74 76
Buddhist 27 21 30 34 33 58 62 39 53 54
Catholic 418 617 732 670 638 1116 949 792 633 1489
Don’t know/refused 15 14 15 11 10 35 21 17 18 116
Evangelical Prot 575 869 1064 982 881 1486 949 723 414 1529

3 variables:

  • religion
  • income
  • frequency
?gather

pew %>%
  gather(key = income, value = frequency, -religion)
#> # A tibble: 180 x 3
#>    religion                income frequency
#>    <chr>                   <chr>      <int>
#>  1 Agnostic                <$10k         27
#>  2 Atheist                 <$10k         12
#>  3 Buddhist                <$10k         27
#>  4 Catholic                <$10k        418
#>  5 Don’t know/refused      <$10k         15
#>  6 Evangelical Prot        <$10k        575
#>  7 Hindu                   <$10k          1
#>  8 Historically Black Prot <$10k        228
#>  9 Jehovah's Witness       <$10k         20
#> 10 Jewish                  <$10k         19
#> # ... with 170 more rows

狀況2: 複合型欄位名 (dataset: tb)

tb <- read_csv("http://bit.ly/tb-raw")
tb
#> # A tibble: 5,769 x 22
#>    iso2   year   m04  m514  m014 m1524 m2534 m3544 m4554 m5564   m65    mu
#>    <chr> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int> <int>
#>  1 AD     1989    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#>  2 AD     1990    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#>  3 AD     1991    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#>  4 AD     1992    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#>  5 AD     1993    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#>  6 AD     1994    NA    NA    NA    NA    NA    NA    NA    NA    NA    NA
#>  7 AD     1996    NA    NA     0     0     0     4     1     0     0    NA
#>  8 AD     1997    NA    NA     0     0     1     2     2     1     6    NA
#>  9 AD     1998    NA    NA     0     0     0     1     0     0     0    NA
#> 10 AD     1999    NA    NA     0     0     0     1     1     0     0    NA
#> # ... with 5,759 more rows, and 10 more variables: f04 <int>, f514 <int>,
#> #   f014 <int>, f1524 <int>, f2534 <int>, f3544 <int>, f4554 <int>,
#> #   f5564 <int>, f65 <int>, fu <int>

tb %>% 
  gather(key = "sex_and_age", value = "cases", m04:fu) %>% 
  drop_na(cases) %>% 
  separate(sex_and_age, into = c("sex", "age"), sep = 1)
#> # A tibble: 35,750 x 5
#>    iso2   year sex   age   cases
#>    <chr> <int> <chr> <chr> <int>
#>  1 AD     2005 m     04        0
#>  2 AD     2006 m     04        0
#>  3 AD     2008 m     04        0
#>  4 AE     2006 m     04        0
#>  5 AE     2007 m     04        0
#>  6 AE     2008 m     04        0
#>  7 AG     2007 m     04        0
#>  8 AL     2005 m     04        0
#>  9 AL     2006 m     04        1
#> 10 AL     2007 m     04        0
#> # ... with 35,740 more rows

狀況3: 欄位散佈在欄和列 (dataset: weather)

weather <- read_csv("http://bit.ly/weather-raw")
weather
#> # A tibble: 22 x 35
#>    id       year month element    d1    d2    d3    d4    d5    d6    d7
#>    <chr>   <int> <int> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#>  1 MX17004  2010     1 tmax       NA  NA    NA      NA  NA      NA    NA
#>  2 MX17004  2010     1 tmin       NA  NA    NA      NA  NA      NA    NA
#>  3 MX17004  2010     2 tmax       NA  27.3  24.1    NA  NA      NA    NA
#>  4 MX17004  2010     2 tmin       NA  14.4  14.4    NA  NA      NA    NA
#>  5 MX17004  2010     3 tmax       NA  NA    NA      NA  32.1    NA    NA
#>  6 MX17004  2010     3 tmin       NA  NA    NA      NA  14.2    NA    NA
#>  7 MX17004  2010     4 tmax       NA  NA    NA      NA  NA      NA    NA
#>  8 MX17004  2010     4 tmin       NA  NA    NA      NA  NA      NA    NA
#>  9 MX17004  2010     5 tmax       NA  NA    NA      NA  NA      NA    NA
#> 10 MX17004  2010     5 tmin       NA  NA    NA      NA  NA      NA    NA
#> # ... with 12 more rows, and 24 more variables: d8 <dbl>, d9 <chr>,
#> #   d10 <dbl>, d11 <dbl>, d12 <chr>, d13 <dbl>, d14 <dbl>, d15 <dbl>,
#> #   d16 <dbl>, d17 <dbl>, d18 <chr>, d19 <chr>, d20 <chr>, d21 <chr>,
#> #   d22 <chr>, d23 <dbl>, d24 <chr>, d25 <dbl>, d26 <dbl>, d27 <dbl>,
#> #   d28 <dbl>, d29 <dbl>, d30 <dbl>, d31 <dbl>

weather %>%
  gather("day", "value", d1:d31, na.rm = TRUE) %>% 
  mutate(day = readr::parse_number(day)) %>%
  select(id, year, month, day, element, value) %>%
  arrange(id, year, month, day) %>% 
  spread(key = element, value = value)
#> # A tibble: 33 x 6
#>    id       year month   day tmax  tmin 
#>    <chr>   <int> <int> <dbl> <chr> <chr>
#>  1 MX17004  2010     1    30 27.8  14.5 
#>  2 MX17004  2010     2     2 27.3  14.4 
#>  3 MX17004  2010     2     3 24.1  14.4 
#>  4 MX17004  2010     2    11 29.7  13.4 
#>  5 MX17004  2010     2    23 29.9  10.7 
#>  6 MX17004  2010     3     5 32.1  14.2 
#>  7 MX17004  2010     3    10 34.5  16.8 
#>  8 MX17004  2010     3    16 31.1  17.6 
#>  9 MX17004  2010     4    27 36.3  16.7 
#> 10 MX17004  2010     5    27 33.2  18.2 
#> # ... with 23 more rows