Yotta 資料視覺化實戰

ggplot2 - movie + 711

木刻思 - YJ

Load packages

library(magrittr)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)

old_theme <- theme_set(theme_light())

如何匯入資料

7-11 dataset

stores <- read_csv("http://bit.ly/711-stores")
stores
#> # A tibble: 5,175 x 36
#>    Address FaxNo  POIID POIName SpecialStore_Ki… Store_URL Telno      X
#>    <chr>   <chr>  <int> <chr>   <chr>            <chr>     <chr>  <dbl>
#>  1 台北市松山區… (02)… 170945 上弘    <NA>             <NA>      (02)… 1.22e8
#>  2 台北市松山區… (02)… 960661 中崙    <NA>             <NA>      (02)… 1.22e8
#>  3 台北市松山區… (02)… 158923 北體    <NA>             <NA>      (02)… 1.22e8
#>  4 台北市松山區… (02)… 159340 台場    <NA>             <NA>      (02)… 1.22e8
#>  5 台北市松山區… <NA>  161387 巨蛋站  <NA>             <NA>      (02)… 1.22e8
#>  6 台北市松山區… (02)… 138583 市大    <NA>             <NA>      (02)… 1.22e8
#>  7 台北市松山區… (02)… 170082 民北    <NA>             <NA>      (02)… 1.22e8
#>  8 台北市松山區… (02)… 961295 民有    <NA>             <NA>      (02)… 1.22e8
#>  9 台北市松山區… (02)… 177142 民復    <NA>             <NA>      (02)… 1.22e8
#> 10 台北市松山區… (02)… 166832 吉仁    <NA>             <NA>      (02)… 1.22e8
#> # ... with 5,165 more rows, and 28 more variables: Y <dbl>,
#> #   city_name <chr>, is7WiFi <chr>, isATM <chr>, isCityCafe <chr>,
#> #   isCorn <chr>, isDining <chr>, isFruit <chr>, isHealthStations <chr>,
#> #   isHotDog <chr>, isIbon <chr>, isIce <chr>, isIceCream <chr>,
#> #   isLavatory <chr>, isMakeup <chr>, isMisterDonuts <chr>, isMuji <chr>,
#> #   isOpenStore <chr>, isOrganic <chr>, isParking <chr>,
#> #   isStarBucks <chr>, isSweetPotato <chr>, isTea <chr>, isUnionPay <chr>,
#> #   town_name <chr>, logitude <dbl>, latitude <dbl>, company <chr>

不同縣市 (city_name) 的店家數

stores %>% 
  ggplot(aes(x = reorder(city_name, table(city_name)[city_name]))) +
  geom_bar() +
  xlab(NULL) +
  coord_flip() +
  theme(text = element_text(family = "Noto Sans CJK TC")) +
  ggtitle("各縣市店家數")

縣市 (city_name) vs (isATM, isHotDog)

stores %>% 
  ggplot(aes(x = reorder(city_name, table(city_name)[city_name]), 
             fill = isATM)) +
  geom_bar() +
  xlab(NULL) +
  coord_flip() +
  theme(text = element_text(family = "Noto Sans CJK TC")) +
  ggtitle("各縣市店家數")

stores %>% 
  ggplot(aes(x = reorder(city_name, table(city_name)[city_name]), 
             fill = isHotDog)) +
  geom_bar() +
  xlab(NULL) +
  coord_flip() +
  theme(text = element_text(family = "Noto Sans CJK TC")) +
  ggtitle("各縣市店家數")

縣市 (city_name) vs “is” columns

d <- stores %>% 
  select(city_name, starts_with("is")) %>% 
  gather(key = "is_cols", value = "yn", -city_name, na.rm = TRUE) %>%
  mutate(yn = yn %>% recode(Y = 1, N = 0)) %>% 
  group_by(city_name, is_cols) %>% 
  summarise(prop = mean(yn))

d %>% head()
#> # A tibble: 6 x 3
#> # Groups:   city_name [1]
#>   city_name is_cols      prop
#>   <chr>     <chr>       <dbl>
#> 1 南投縣    is7WiFi    0.978 
#> 2 南投縣    isATM      0.966 
#> 3 南投縣    isCityCafe 0.382 
#> 4 南投縣    isCorn     0.0337
#> 5 南投縣    isDining   0.989 
#> 6 南投縣    isFruit    0.0562

d %>% 
  ggplot(aes(x = city_name, y = is_cols, fill = prop)) +
  geom_tile(color = "white") +
  scale_fill_distiller(direction = 1) +
  scale_x_discrete(name = NULL, position = "top") +
  theme(
    text = element_text(family = "Noto Sans CJK TC"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  ) +
  ggtitle("各縣市店家配備率")

Movie dataset

movie <- read_csv("http://bit.ly/movie-main") %>% 
  mutate(movie_id = movie_id %>% as.character)

genre <- read_csv("http://bit.ly/movie-genre") %>% 
  mutate(genre_id = genre_id %>% as.character,
         movie_id = movie_id %>% as.character) %>% 
  rename(genre_name = name)

directors <- read_csv("http://bit.ly/movie-directors") %>% 
  mutate(movie_id = movie_id %>% as.character,
         star_id = star_id %>% as.character)

stars <- read_csv("http://bit.ly/movie-stars") %>% 
  mutate(movie_id = movie_id %>% as.character,
         star_id = star_id %>% as.character)

duration histogram

movie %>% 
  filter(duration > 1) %>% 
  ggplot(aes(x = duration)) +
  geom_histogram(binwidth = 10, color = "white") +
  scale_x_continuous(breaks = seq(0, 250, 30),
                     minor_breaks = seq(0, 250, 10))

Top movies: imdb_score >= 9

movie %>% 
  ggplot(aes(x = imdb_score)) +
  geom_histogram(binwidth = 1, color = "white") +
  scale_x_continuous(breaks = seq(0, 10, 1))

movie %>% 
  filter(imdb_score > 7.5) %>% 
  select(name_in_ch, imdb_score)
#> # A tibble: 20 x 2
#>    name_in_ch                 imdb_score
#>    <chr>                           <dbl>
#>  1 謝謝你,在世界的角落找到我        8.1
#>  2 敦克爾克大行動                    8.5
#>  3 猩球崛起:終極決戰                7.9
#>  4 蜘蛛人:返校日                    8  
#>  5 佈局                              8  
#>  6 神力女超人                        7.9
#>  7 天才的禮物                        7.7
#>  8 星際異攻隊2                       8.1
#>  9 逃出絕命鎮                        7.8
#> 10 目擊者                            7.9
#> 11 明天,我要和昨天的妳約會          7.7
#> 12 愛情昏迷中                        8.1
#> 13 宣告黎明的露之歌                  7.9
#> 14 玩命再劫                          8.2
#> 15 極地追擊                          7.8
#> 16 刀劍神域劇場版-序列爭戰          7.7
#> 17 鬥陣俱樂部                        8.8
#> 18 我和我的冠軍女兒                  8.7
#> 19 少女與戰車劇場版(極爆版)        7.7
#> 20 穆荷蘭大道                        8

電影類型

genre %>% 
  group_by(genre_name) %>% 
  tally(sort = TRUE)
#> # A tibble: 18 x 2
#>    genre_name     n
#>    <chr>      <int>
#>  1 劇情         357
#>  2 懸疑/驚悚    134
#>  3 動作         104
#>  4 喜劇          89
#>  5 愛情          75
#>  6 恐怖          62
#>  7 動畫          55
#>  8 冒險          51
#>  9 科幻          38
#> 10 奇幻          32
#> 11 犯罪          30
#> 12 紀錄片        28
#> 13 音樂/歌舞     22
#> 14 歷史/傳記     20
#> 15 溫馨/家庭     17
#> 16 戰爭          11
#> 17 勵志           6
#> 18 影展           6

時間長度 vs 電影類型

d <- movie %>% 
  filter(duration > 1) %>% 
  select(movie_id, duration) %>% 
  inner_join(genre, by = "movie_id") %>% 
  group_by(genre_name) %>%
  summarise(
    min_duration = min(duration),
    max_duration = max(duration),
    mean_duration = mean(duration)
  ) %>% 
  mutate(genre_name = reorder(genre_name, mean_duration))
d
#> # A tibble: 17 x 4
#>    genre_name min_duration max_duration mean_duration
#>    <fct>             <dbl>        <dbl>         <dbl>
#>  1 冒險                 60          152         114. 
#>  2 劇情                 63          214         117. 
#>  3 動作                 87          167         120. 
#>  4 動畫                 60          130          99.9
#>  5 勵志                109          161         130. 
#>  6 喜劇                 86          153         108. 
#>  7 奇幻                 87          167         116. 
#>  8 恐怖                 79          135          96.2
#>  9 愛情                 77          184         109  
#> 10 懸疑/驚悚            79          138         104. 
#> 11 戰爭                 90          130         110. 
#> 12 歷史/傳記            90          161         124. 
#> 13 溫馨/家庭            95          131         117. 
#> 14 犯罪                 82          138         111. 
#> 15 科幻                 82          152         113. 
#> 16 紀錄片               70          197         104. 
#> 17 音樂/歌舞            93          151         114.

d %>% 
  gather(key = "stat", value = "value", -genre_name) %>%
  ggplot() +
  geom_segment(
    data = d,
    aes(x = min_duration, xend = max_duration,
        y = genre_name, yend = genre_name),
    color = "lightgrey",
    size = 3) +
  geom_point(aes(y = genre_name, x = value, color = stat),
             size = 3, shape = 15) +
  scale_x_continuous(breaks = seq(0, 220, 30),
                     minor_breaks = seq(0, 220, 10)) +
  theme(text = element_text(family = "Noto Sans CJK TC"),
        legend.position = "top")

各月份的電影上映數 + 類型

d <- movie %>% 
  filter(duration > 1) %>% 
  select(movie_id, release_date_str) %>% 
  inner_join(genre, by = "movie_id") %>% 
  mutate(YYMM = scales::date_format("%y-%m")(release_date_str)) %>%
  filter(YYMM >= "17-01" & YYMM < "18-01") %>% 
  group_by(genre_name, YYMM) %>% 
  tally()

d
#> # A tibble: 119 x 3
#> # Groups:   genre_name [?]
#>    genre_name YYMM      n
#>    <chr>      <chr> <int>
#>  1 冒險       17-02     1
#>  2 冒險       17-03     1
#>  3 冒險       17-04     2
#>  4 冒險       17-05     1
#>  5 冒險       17-06     3
#>  6 冒險       17-07     4
#>  7 冒險       17-08     2
#>  8 冒險       17-09     1
#>  9 冒險       17-11     1
#> 10 冒險       17-12     2
#> # ... with 109 more rows

d %>% 
  ggplot(aes(YYMM, y = n)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ genre_name, ncol = 3) +
  ggtitle("每月電影上映數") +
  theme(
    text = element_text(family = "Noto Sans CJK TC"),
    axis.text.x = element_text(angle = 90, hjust = 1)
  )