# knitr chunk default: echo the source code of every chunk in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
library(magrittr)
library(dplyr)
library(pivottabler)
library(tidyverse)
library(ggridges)
library(modelr)
library(mgcv)
# ---- Figure and plotting defaults ----

# Default figure geometry for every knitted chunk.
knitr::opts_chunk$set(fig.width = 6, fig.asp = .6, out.width = "90%")

# Minimal ggplot2 theme, with the legend placed underneath the panel.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Continuous colour/fill scales default to viridis.
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# Shadow ggplot2's default discrete scales with the viridis versions so that
# plots built later in this document pick them up without naming a scale.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d
# Read every file in "30_cities_data" and stack them into one long table with
# one row per city and day, keeping only the city name, the date and the
# pm25 reading.
city_30_df_pm25 <-
  tibble(file = list.files("30_cities_data")) %>%
  mutate(
    # The city name is the file name minus its "-air-quality.csv" suffix.
    city = str_remove(file, "-air-quality.csv"),
    path = str_c("30_cities_data/", file),
    data = map(path, read_csv)
  ) %>%
  unnest(data) %>%
  mutate(
    city = str_to_title(city),
    date = as.Date(date, format = "%Y/%m/%d")
  ) %>%
  # Keeping just these three columns also discards the helper columns
  # `file` and `path`.
  select(city, date, pm25)
# Rows of `city_30_df_pm25` dated strictly after 31 January and strictly before
# 1 May of `year`, with `date` re-encoded as a "yy-mm-dd" character string.
feb_to_apr_pm25 <- function(year) {
  window_start <- str_c(year, "-01-31")
  window_end <- str_c(year, "-05-01")

  city_30_df_pm25 %>%
    filter(date > window_start & date < window_end) %>%
    mutate(date = format(date, format = "%y-%m-%d")) %>%
    select(city, date, pm25)
}

# One February-April slice per year.
pm25_2020 <- feb_to_apr_pm25(2020)
pm25_2019 <- feb_to_apr_pm25(2019)
pm25_2018 <- feb_to_apr_pm25(2018)
pm25_2017 <- feb_to_apr_pm25(2017)

# Pooled comparison periods: the three pre-2020 years, and 2020 with 2019.
pm25_171819 <- bind_rows(pm25_2017, pm25_2018, pm25_2019)
pm25_1920 <- bind_rows(pm25_2020, pm25_2019)
Are city and air quality level dependent at the 5% level of significance? In other words, given the data collected above, is there a relationship between a city and the air quality levels it recorded?
Null hypothesis (H0): air quality level and city, the two variables of the contingency table, are independent for February to April 2020.
Alternative hypothesis (H1): air quality level and city, the two variables of the contingency table, are dependent for February to April 2020.
# Classify every non-missing February-April 2020 reading into an air-quality
# level. `case_when()` takes the first matching branch, so each cut-off is the
# upper bound of its band; readings above 500 match no branch and get NA.
city_PM25 <- pm25_2020 %>%
  drop_na() %>%
  select(city, pm25) %>%
  mutate(
    level = case_when(
      pm25 <= 50 ~ "Good",
      pm25 <= 100 ~ "Moderate",
      pm25 <= 150 ~ "Unhealthy for Sensitive People",
      # The label previously carried a stray leading space (" Unhealthy"),
      # which produced a mis-named column after pivoting.
      pm25 <= 200 ~ "Unhealthy",
      pm25 <= 300 ~ "Very Unhealthy",
      pm25 <= 500 ~ "Hazardous"
    )
  ) %>%
  arrange(city, level, pm25)

# City x level contingency table of day counts: one row per city, one column
# per level. `count()` returns an ungrouped tibble, and `values_fill = 0`
# records city/level combinations that never occurred as 0 instead of NA.
city_level <- city_PM25 %>%
  count(city, level) %>%
  pivot_wider(names_from = "level", values_from = "n", values_fill = 0)
# Load the saved city x air-quality-level count table; the first CSV column
# supplies the row names. Then render it as a table.
airquality_level <- read.csv(
  "test (Fei)/city_air_quality_level.csv",
  row.names = 1
)
airquality_level %>%
  knitr::kable()
| Good | Moderate | Unhealthy.for.Sensitive.People | Unhealthy | Very.Unhealthy | Hazardous | Total |
---|---|---|---|---|---|---|---|
Beijing | 19 | 31 | 23 | 14 | 3 | 0 | 90 |
Changchun | 2 | 29 | 37 | 8 | 7 | 5 | 88 |
Changsha | 2 | 17 | 54 | 15 | 0 | 0 | 88 |
Chengdu | 1 | 22 | 47 | 18 | 0 | 0 | 88 |
Chongqing | 0 | 20 | 53 | 15 | 0 | 0 | 88 |
Fuzhou | 9 | 56 | 23 | 0 | 0 | 0 | 88 |
Guangzhou | 17 | 52 | 16 | 3 | 0 | 0 | 88 |
Guiyang | 0 | 42 | 43 | 3 | 0 | 0 | 88 |
Harbin | 7 | 36 | 25 | 12 | 6 | 1 | 87 |
Hefei | 3 | 45 | 36 | 4 | 0 | 0 | 88 |
Jinan | 1 | 21 | 47 | 19 | 0 | 0 | 88 |
Kunming | 1 | 50 | 36 | 1 | 0 | 0 | 88 |
Lanzhou | 0 | 26 | 62 | 0 | 0 | 0 | 88 |
Lhasa | 56 | 32 | 0 | 0 | 0 | 0 | 88 |
Nanchang | 0 | 28 | 50 | 10 | 0 | 0 | 88 |
Nanjing | 0 | 42 | 39 | 4 | 0 | 0 | 85 |
Nanning | 10 | 52 | 22 | 4 | 0 | 0 | 88 |
Shanghai | 5 | 49 | 29 | 3 | 1 | 1 | 88 |
Shenyang | 1 | 28 | 33 | 24 | 2 | 0 | 88 |
Shenzhen | 2 | 67 | 17 | 2 | 0 | 0 | 88 |
Shijiazhuang | 0 | 21 | 37 | 24 | 6 | 0 | 88 |
Suzhou | 0 | 33 | 43 | 12 | 0 | 0 | 88 |
Taiyuan | 2 | 29 | 32 | 20 | 5 | 0 | 88 |
Tianjin | 9 | 29 | 27 | 21 | 2 | 0 | 88 |
Wuhan | 1 | 20 | 55 | 12 | 0 | 0 | 88 |
Wulumuqi | 3 | 46 | 22 | 11 | 4 | 2 | 88 |
Xian | 0 | 19 | 44 | 19 | 6 | 0 | 88 |
Xining | 0 | 28 | 58 | 2 | 0 | 0 | 88 |
Yinchuan | 0 | 57 | 24 | 7 | 0 | 0 | 88 |
Zhengzhou | 2 | 18 | 47 | 19 | 2 | 0 | 88 |
Total | 153 | 1045 | 1081 | 306 | 44 | 9 | 2638 |
# Pearson chi-squared test of independence between city and air-quality level,
# with a Monte Carlo p-value.
# The loaded table carries a "Total" row and a "Total" column. Those margins
# are sums of the other cells, not observed categories, so they are removed
# before testing.
airquality_counts <- airquality_level[
  rownames(airquality_level) != "Total",
  colnames(airquality_level) != "Total",
  drop = FALSE
]
chisq.test(airquality_counts, simulate.p.value = TRUE)
## NOTE(review): with consistent margins every "Total" cell has observed ==
## expected, so dropping them leaves X-squared unchanged; re-knit to confirm
## the echoed values below.
##
## Pearson's Chi-squared test with simulated p-value (based on 2000
## replicates)
##
## data: airquality_counts
## X-squared = 1363.2, df = NA, p-value = 0.0004998
# Critical value for the chi-squared test of independence at the 5% level.
# The rejection region is the upper tail, and the degrees of freedom are
# (30 cities - 1) * (6 air-quality levels - 1) = 145; the "Total" margins are
# not categories and do not count towards them.
chisq_critical <- qchisq(0.05, df = 145, lower.tail = FALSE)
chisq_critical
## NOTE(review): the value previously echoed here (144.494) was the lower 5%
## quantile at df = 174; re-knit to refresh the output (expected about 174.1).
For the chi-square test, the p-value (0.0004998) is less than the 0.05 significance level, so we reject the null hypothesis (H0). There is sufficient evidence to conclude that air quality level and city, the two variables of the contingency table, are not independent; that is, there is a relationship between them.
Null hypothesis (H0): The true mean difference, across the 30 cities, between each city's average AQI (pm25) from February to April 2020 and its average over the same months of 2017-2019 is equal to zero.
Alternative hypothesis (H1): The true mean difference, across the 30 cities, between each city's average AQI (pm25) from February to April 2020 and its average over the same months of 2017-2019 is not equal to zero.
# Per-city mean pm25 for February-April 2020, as a plain numeric vector.
# `summarise()` after `group_by(city)` yields one value per city, ordered by
# city.
mean_20 <- pm25_2020 %>%
  drop_na() %>%
  group_by(city) %>%
  summarise(mean = mean(pm25)) %>%
  pull(mean)

# The same per-city means for the pooled 2017-2019 February-April readings.
# Both vectors are matched up by position later on, so they only line up
# when each period contains the same set of cities.
mean_171819 <- pm25_171819 %>%
  drop_na() %>%
  group_by(city) %>%
  summarise(mean = mean(pm25)) %>%
  pull(mean)
# Per-city mean and standard deviation of pm25, computed once per period.
city_stats_2020 <- pm25_2020 %>%
  drop_na() %>%
  group_by(city) %>%
  summarise(
    mean_2020 = mean(pm25),
    standard_deviation_2020 = sd(pm25)
  )

city_stats_171819 <- pm25_171819 %>%
  drop_na() %>%
  group_by(city) %>%
  summarise(
    mean_2017_2018_2019 = mean(pm25),
    standard_deviation_2017_2018_2019 = sd(pm25)
  )

# The 2020 tables keep the `city` column; the 2017-2019 tables hold only the
# statistic, because they are glued on column-wise by row position.
table1 <- city_stats_2020 %>% select(city, mean_2020)
table2 <- city_stats_171819 %>% select(mean_2017_2018_2019)
table3 <- city_stats_2020 %>% select(city, standard_deviation_2020)
table4 <- city_stats_171819 %>% select(standard_deviation_2017_2018_2019)

# Side-by-side per-city means for 2020 versus 2017-2019.
table1 %>%
  bind_cols(table2) %>%
  knitr::kable()
city | mean_2020 | mean_2017_2018_2019 |
---|---|---|
Beijing | 101.08889 | 122.56180 |
Changchun | 129.68182 | 130.83146 |
Changsha | 119.76136 | 131.89139 |
Chengdu | 120.07955 | 138.58052 |
Chongqing | 118.37500 | 122.62547 |
Fuzhou | 85.51136 | 101.99625 |
Guangzhou | 79.71591 | 104.67416 |
Guiyang | 104.68182 | 119.70037 |
Harbin | 118.10227 | 128.25468 |
Hefei | 99.15909 | 133.51311 |
Jinan | 121.12500 | 153.87640 |
Kunming | 93.87500 | 107.58801 |
Lanzhou | 109.44318 | 127.83895 |
Lhasa | 47.64773 | 70.38202 |
Nanchang | 117.04545 | 122.91011 |
Nanjing | 103.63529 | 131.98876 |
Nanning | 89.17045 | 102.31461 |
Shanghai | 96.43182 | 121.94382 |
Shenyang | 124.36364 | 131.01124 |
Shenzhen | 86.70455 | 108.02622 |
Shijiazhuang | 132.56818 | 170.88015 |
Suzhou | 112.28409 | 138.64419 |
Taiyuan | 123.22727 | 141.02622 |
Tianjin | 112.10227 | 131.56015 |
Wuhan | 119.30682 | 148.14286 |
Wulumuqi | 111.55682 | 153.50562 |
Xian | 130.88636 | 156.21348 |
Xining | 112.95455 | 125.71536 |
Yinchuan | 95.00000 | 113.98876 |
Zhengzhou | 125.65909 | 153.83895 |
# Side-by-side per-city standard deviations for 2020 versus 2017-2019; the two
# tables are combined column-wise by row position.
table3 %>%
  bind_cols(table4) %>%
  knitr::kable()
city | standard_deviation_2020 | standard_deviation_2017_2018_2019 |
---|---|---|
Beijing | 53.622410 | 58.27449 |
Changchun | 71.154930 | 41.89880 |
Changsha | 31.946433 | 35.77939 |
Chengdu | 31.925280 | 31.39782 |
Chongqing | 27.455842 | 32.98602 |
Fuzhou | 26.220547 | 31.28411 |
Guangzhou | 30.560266 | 32.54474 |
Guiyang | 25.447510 | 27.04869 |
Harbin | 82.209578 | 51.92377 |
Hefei | 26.941779 | 29.62705 |
Jinan | 33.247872 | 35.76158 |
Kunming | 25.200717 | 27.87814 |
Lanzhou | 17.747823 | 25.46043 |
Lhasa | 9.629982 | 20.77276 |
Nanchang | 27.839404 | 33.82973 |
Nanjing | 25.560780 | 29.50181 |
Nanning | 33.432802 | 30.92044 |
Shanghai | 43.508170 | 38.68478 |
Shenyang | 43.895430 | 41.74101 |
Shenzhen | 22.903676 | 26.22566 |
Shijiazhuang | 45.105444 | 66.73903 |
Suzhou | 29.967396 | 33.18705 |
Taiyuan | 45.217159 | 45.75691 |
Tianjin | 45.379888 | 55.96237 |
Wuhan | 28.383806 | 30.04290 |
Wulumuqi | 55.372689 | 76.66299 |
Xian | 37.331516 | 49.79993 |
Xining | 19.740215 | 29.23179 |
Yinchuan | 30.217982 | 33.16596 |
Zhengzhou | 34.516325 | 46.23868 |
# Paired t-test on the per-city means: each city's 2020 mean is paired by
# position with its 2017-2019 mean. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
t.test(mean_20, mean_171819, paired = TRUE)
##
## Paired t-test
##
## data: mean_20 and mean_171819
## t = -11.116, df = 29, p-value = 5.679e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -23.87243 -16.45294
## sample estimates:
## mean of the differences
## -20.16268
# Critical value for the paired t-test at the 5% level with 29 degrees of
# freedom. The alternative is two-sided ("not equal to 0"), so 2.5% goes in
# each tail and H0 is rejected when |t| exceeds this value.
t_critical <- qt(1 - 0.05 / 2, df = 29)
t_critical
## NOTE(review): the value previously echoed here (-1.699127) was the
## one-sided lower 5% quantile; re-knit to refresh the output (expected
## about 2.045).
For the two-sample paired t-test, the p-value (5.679e-12) is less than the 0.05 significance level, so we reject the null hypothesis (H0). There is sufficient evidence to conclude that the true mean difference, across the 30 cities, between the average AQI (pm25) from February to April 2020 and the average for 2017-2019 is not equal to zero: the estimated mean difference is -20.16, i.e. the city averages were lower in 2020 than in 2017-2019.