Load packages
library(magrittr)
library(readr)
library(dplyr)
library(tidyr)
library(ggplot2)
library(ggrepel)
old_theme <- theme_set(theme_light())如何匯入資料
7-11 dataset
stores <- read_csv("http://bit.ly/711-stores")
stores
#> # A tibble: 5,175 x 36
#> Address FaxNo POIID POIName SpecialStore_Ki… Store_URL Telno X
#> <chr> <chr> <int> <chr> <chr> <chr> <chr> <dbl>
#> 1 台北市松山區… (02)… 170945 上弘 <NA> <NA> (02)… 1.22e8
#> 2 台北市松山區… (02)… 960661 中崙 <NA> <NA> (02)… 1.22e8
#> 3 台北市松山區… (02)… 158923 北體 <NA> <NA> (02)… 1.22e8
#> 4 台北市松山區… (02)… 159340 台場 <NA> <NA> (02)… 1.22e8
#> 5 台北市松山區… <NA> 161387 巨蛋站 <NA> <NA> (02)… 1.22e8
#> 6 台北市松山區… (02)… 138583 市大 <NA> <NA> (02)… 1.22e8
#> 7 台北市松山區… (02)… 170082 民北 <NA> <NA> (02)… 1.22e8
#> 8 台北市松山區… (02)… 961295 民有 <NA> <NA> (02)… 1.22e8
#> 9 台北市松山區… (02)… 177142 民復 <NA> <NA> (02)… 1.22e8
#> 10 台北市松山區… (02)… 166832 吉仁 <NA> <NA> (02)… 1.22e8
#> # ... with 5,165 more rows, and 28 more variables: Y <dbl>,
#> # city_name <chr>, is7WiFi <chr>, isATM <chr>, isCityCafe <chr>,
#> # isCorn <chr>, isDining <chr>, isFruit <chr>, isHealthStations <chr>,
#> # isHotDog <chr>, isIbon <chr>, isIce <chr>, isIceCream <chr>,
#> # isLavatory <chr>, isMakeup <chr>, isMisterDonuts <chr>, isMuji <chr>,
#> # isOpenStore <chr>, isOrganic <chr>, isParking <chr>,
#> # isStarBucks <chr>, isSweetPotato <chr>, isTea <chr>, isUnionPay <chr>,
#> # town_name <chr>, logitude <dbl>, latitude <dbl>, company <chr>不同縣市 (city_name) 的店家數
stores %>%
ggplot(aes(x = reorder(city_name, table(city_name)[city_name]))) +
geom_bar() +
xlab(NULL) +
coord_flip() +
theme(text = element_text(family = "Noto Sans CJK TC")) +
ggtitle("各縣市店家數")縣市 (city_name) vs (isATM, isHotDog)
stores %>%
ggplot(aes(x = reorder(city_name, table(city_name)[city_name]),
fill = isATM)) +
geom_bar() +
xlab(NULL) +
coord_flip() +
theme(text = element_text(family = "Noto Sans CJK TC")) +
ggtitle("各縣市店家數")stores %>%
ggplot(aes(x = reorder(city_name, table(city_name)[city_name]),
fill = isHotDog)) +
geom_bar() +
xlab(NULL) +
coord_flip() +
theme(text = element_text(family = "Noto Sans CJK TC")) +
ggtitle("各縣市店家數")縣市 (city_name) vs “is” columns
d <- stores %>%
select(city_name, starts_with("is")) %>%
gather(key = "is_cols", value = "yn", -city_name, na.rm = TRUE) %>%
mutate(yn = yn %>% recode(Y = 1, N = 0)) %>%
group_by(city_name, is_cols) %>%
summarise(prop = mean(yn))
d %>% head()
#> # A tibble: 6 x 3
#> # Groups: city_name [1]
#> city_name is_cols prop
#> <chr> <chr> <dbl>
#> 1 南投縣 is7WiFi 0.978
#> 2 南投縣 isATM 0.966
#> 3 南投縣 isCityCafe 0.382
#> 4 南投縣 isCorn 0.0337
#> 5 南投縣 isDining 0.989
#> 6 南投縣 isFruit 0.0562
d %>%
ggplot(aes(x = city_name, y = is_cols, fill = prop)) +
geom_tile(color = "white") +
scale_fill_distiller(direction = 1) +
scale_x_discrete(name = NULL, position = "top") +
theme(
text = element_text(family = "Noto Sans CJK TC"),
axis.text.x = element_text(angle = 90, hjust = 1)
) +
ggtitle("各縣市店家配備率")Movie dataset
movie <- read_csv("http://bit.ly/movie-main") %>%
mutate(movie_id = movie_id %>% as.character)
genre <- read_csv("http://bit.ly/movie-genre") %>%
mutate(genre_id = genre_id %>% as.character,
movie_id = movie_id %>% as.character) %>%
rename(genre_name = name)
directors <- read_csv("http://bit.ly/movie-directors") %>%
mutate(movie_id = movie_id %>% as.character,
star_id = star_id %>% as.character)
stars <- read_csv("http://bit.ly/movie-stars") %>%
mutate(movie_id = movie_id %>% as.character,
star_id = star_id %>% as.character)duration histogram
movie %>%
filter(duration > 1) %>%
ggplot(aes(x = duration)) +
geom_histogram(binwidth = 10, color = "white") +
scale_x_continuous(breaks = seq(0, 250, 30),
minor_breaks = seq(0, 250, 10))Top movies: imdb_score >= 9
movie %>%
ggplot(aes(x = imdb_score)) +
geom_histogram(binwidth = 1, color = "white") +
scale_x_continuous(breaks = seq(0, 10, 1))movie %>%
filter(imdb_score > 7.5) %>%
select(name_in_ch, imdb_score)
#> # A tibble: 20 x 2
#> name_in_ch imdb_score
#> <chr> <dbl>
#> 1 謝謝你,在世界的角落找到我 8.1
#> 2 敦克爾克大行動 8.5
#> 3 猩球崛起:終極決戰 7.9
#> 4 蜘蛛人:返校日 8
#> 5 佈局 8
#> 6 神力女超人 7.9
#> 7 天才的禮物 7.7
#> 8 星際異攻隊2 8.1
#> 9 逃出絕命鎮 7.8
#> 10 目擊者 7.9
#> 11 明天,我要和昨天的妳約會 7.7
#> 12 愛情昏迷中 8.1
#> 13 宣告黎明的露之歌 7.9
#> 14 玩命再劫 8.2
#> 15 極地追擊 7.8
#> 16 刀劍神域劇場版-序列爭戰 7.7
#> 17 鬥陣俱樂部 8.8
#> 18 我和我的冠軍女兒 8.7
#> 19 少女與戰車劇場版(極爆版) 7.7
#> 20 穆荷蘭大道 8電影類型
genre %>%
group_by(genre_name) %>%
tally(sort = TRUE)
#> # A tibble: 18 x 2
#> genre_name n
#> <chr> <int>
#> 1 劇情 357
#> 2 懸疑/驚悚 134
#> 3 動作 104
#> 4 喜劇 89
#> 5 愛情 75
#> 6 恐怖 62
#> 7 動畫 55
#> 8 冒險 51
#> 9 科幻 38
#> 10 奇幻 32
#> 11 犯罪 30
#> 12 紀錄片 28
#> 13 音樂/歌舞 22
#> 14 歷史/傳記 20
#> 15 溫馨/家庭 17
#> 16 戰爭 11
#> 17 勵志 6
#> 18 影展 6時間長度 vs 電影類型
d <- movie %>%
filter(duration > 1) %>%
select(movie_id, duration) %>%
inner_join(genre, by = "movie_id") %>%
group_by(genre_name) %>%
summarise(
min_duration = min(duration),
max_duration = max(duration),
mean_duration = mean(duration)
) %>%
mutate(genre_name = reorder(genre_name, mean_duration))
d
#> # A tibble: 17 x 4
#> genre_name min_duration max_duration mean_duration
#> <fct> <dbl> <dbl> <dbl>
#> 1 冒險 60 152 114.
#> 2 劇情 63 214 117.
#> 3 動作 87 167 120.
#> 4 動畫 60 130 99.9
#> 5 勵志 109 161 130.
#> 6 喜劇 86 153 108.
#> 7 奇幻 87 167 116.
#> 8 恐怖 79 135 96.2
#> 9 愛情 77 184 109
#> 10 懸疑/驚悚 79 138 104.
#> 11 戰爭 90 130 110.
#> 12 歷史/傳記 90 161 124.
#> 13 溫馨/家庭 95 131 117.
#> 14 犯罪 82 138 111.
#> 15 科幻 82 152 113.
#> 16 紀錄片 70 197 104.
#> 17 音樂/歌舞 93 151 114.
d %>%
gather(key = "stat", value = "value", -genre_name) %>%
ggplot() +
geom_segment(
data = d,
aes(x = min_duration, xend = max_duration,
y = genre_name, yend = genre_name),
color = "lightgrey",
size = 3) +
geom_point(aes(y = genre_name, x = value, color = stat),
size = 3, shape = 15) +
scale_x_continuous(breaks = seq(0, 220, 30),
minor_breaks = seq(0, 220, 10)) +
theme(text = element_text(family = "Noto Sans CJK TC"),
legend.position = "top")各月份的電影上映數 + 類型
d <- movie %>%
filter(duration > 1) %>%
select(movie_id, release_date_str) %>%
inner_join(genre, by = "movie_id") %>%
mutate(YYMM = scales::date_format("%y-%m")(release_date_str)) %>%
filter(YYMM >= "17-01" & YYMM < "18-01") %>%
group_by(genre_name, YYMM) %>%
tally()
d
#> # A tibble: 119 x 3
#> # Groups: genre_name [?]
#> genre_name YYMM n
#> <chr> <chr> <int>
#> 1 冒險 17-02 1
#> 2 冒險 17-03 1
#> 3 冒險 17-04 2
#> 4 冒險 17-05 1
#> 5 冒險 17-06 3
#> 6 冒險 17-07 4
#> 7 冒險 17-08 2
#> 8 冒險 17-09 1
#> 9 冒險 17-11 1
#> 10 冒險 17-12 2
#> # ... with 109 more rows
d %>%
ggplot(aes(YYMM, y = n)) +
geom_bar(stat = "identity") +
facet_wrap(~ genre_name, ncol = 3) +
ggtitle("每月電影上映數") +
theme(
text = element_text(family = "Noto Sans CJK TC"),
axis.text.x = element_text(angle = 90, hjust = 1)
)