# Set the knitr chunk option `echo = TRUE` as the default for all chunks,
# so each chunk's source code is shown alongside its output.
knitr::opts_chunk$set(echo = TRUE)

library(magrittr)
library(dplyr)
library(pivottabler)
library(tidyverse)
library(ggridges)
library(modelr)
library(mgcv)
# Default figure geometry for every knitted chunk.
knitr::opts_chunk$set(fig.width = 6, fig.asp = 0.6, out.width = "90%")

# Use the viridis palette for continuous colour and fill scales ...
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# ... and for discrete scales, by masking ggplot2's default discrete scales.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d

# Minimal theme with the legend placed below the plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Read every file in 30_cities_data/ and stack the results into one tibble
# with the columns city, date and pm25.
pm25_files <- list.files("30_cities_data")

city_30_df_pm25 <-
  tibble(file = pm25_files) %>%
  mutate(
    # The city name is the file name without its suffix, in title case.
    city = str_to_title(str_remove(file, "-air-quality.csv")),
    path = str_c("30_cities_data/", file),
    data = map(path, read_csv)
  ) %>%
  unnest(data) %>%
  mutate(date = as.Date(date, format = "%Y/%m/%d")) %>%
  # Keeping only these three columns also discards the helper columns
  # `file` and `path`.
  select(city, date, pm25)

# Keep the readings dated strictly after 31 January and strictly before 1 May
# of `year` (i.e. February through April). The `date` column is returned as a
# "yy-mm-dd" character string.
#
# pm25_df: tibble with columns city, date (Date) and pm25.
# year:    four-digit year, e.g. 2020.
filter_feb_to_apr <- function(pm25_df, year) {
  pm25_df %>%
    filter(
      date > as.Date(str_c(year, "-01-31")),
      date < as.Date(str_c(year, "-05-01"))
    ) %>%
    mutate(date = format(date, format = "%y-%m-%d")) %>%
    select(city, date, pm25)
}

pm25_2020 <- filter_feb_to_apr(city_30_df_pm25, 2020)
pm25_2019 <- filter_feb_to_apr(city_30_df_pm25, 2019)
pm25_2018 <- filter_feb_to_apr(city_30_df_pm25, 2018)
pm25_2017 <- filter_feb_to_apr(city_30_df_pm25, 2017)

# Pooled February-April readings for 2017-2019.
pm25_171819 <- bind_rows(pm25_2017, pm25_2018, pm25_2019)

# February-April readings for 2020 followed by those for 2019.
pm25_1920 <- bind_rows(pm25_2020, pm25_2019)

Chi-Squared Test

Are city and air quality level dependent at the 5% level of significance? In other words, given the data collected above, is there a relationship between a city and the air quality levels it recorded?

Null hypothesis (H0): the two variables of the contingency table, air quality level and city, are independent for the period from February to April 2020.

Alternative hypothesis (H1): the two variables of the contingency table, air quality level and city, are dependent for the period from February to April 2020.

# Classify each non-missing 2020 PM2.5 AQI reading into an air-quality level.
# case_when() uses the first condition that matches, so the upper bounds are
# tested in ascending order. Readings above 500 match no condition and get NA.
city_PM25 <- pm25_2020 %>%
  drop_na() %>%
  select(city, pm25) %>%
  mutate(
    level = case_when(
      pm25 <= 50 ~ "Good",
      pm25 <= 100 ~ "Moderate",
      pm25 <= 150 ~ "Unhealthy for Sensitive People",
      # No leading space in the label: it becomes a column name after pivoting.
      pm25 <= 200 ~ "Unhealthy",
      pm25 <= 300 ~ "Very Unhealthy",
      pm25 <= 500 ~ "Hazardous"
    )
  ) %>%
  arrange(city, level, pm25)

# City-by-level contingency table: one row per city, one column per level,
# each cell the number of readings. City/level combinations that never occur
# are filled with 0 rather than NA. The result is an ungrouped tibble.
city_level <- city_PM25 %>%
  count(city, level) %>%
  pivot_wider(names_from = "level", values_from = "n", values_fill = 0)

Air Quality Level Table

# Load the city-by-level count table from disk, using the first CSV column as
# row names, and render it as a table.
airquality_level <- read.csv(
  file = "test (Fei)/city_air_quality_level.csv",
  row.names = 1
)
airquality_level %>% knitr::kable()
Good Moderate Unhealthy.for.Sensitive.People Unhealthy Very.Unhealthy Hazardous Total
Beijing 19 31 23 14 3 0 90
Changchun 2 29 37 8 7 5 88
Changsha 2 17 54 15 0 0 88
Chengdu 1 22 47 18 0 0 88
Chongqing 0 20 53 15 0 0 88
Fuzhou 9 56 23 0 0 0 88
Guangzhou 17 52 16 3 0 0 88
Guiyang 0 42 43 3 0 0 88
Harbin 7 36 25 12 6 1 87
Hefei 3 45 36 4 0 0 88
Jinan 1 21 47 19 0 0 88
Kunming 1 50 36 1 0 0 88
Lanzhou 0 26 62 0 0 0 88
Lhasa 56 32 0 0 0 0 88
Nanchang 0 28 50 10 0 0 88
Nanjing 0 42 39 4 0 0 85
Nanning 10 52 22 4 0 0 88
Shanghai 5 49 29 3 1 1 88
Shenyang 1 28 33 24 2 0 88
Shenzhen 2 67 17 2 0 0 88
Shijiazhuang 0 21 37 24 6 0 88
Suzhou 0 33 43 12 0 0 88
Taiyuan 2 29 32 20 5 0 88
Tianjin 9 29 27 21 2 0 88
Wuhan 1 20 55 12 0 0 88
Wulumuqi 3 46 22 11 4 2 88
Xian 0 19 44 19 6 0 88
Xining 0 28 58 2 0 0 88
Yinchuan 0 57 24 7 0 0 88
Zhengzhou 2 18 47 19 2 0 88
Total 153 1045 1081 306 44 9 2638
# Pearson chi-squared test of independence between city and air-quality level,
# with a Monte Carlo (simulated) p-value.
# The loaded table carries a "Total" row and a "Total" column. Those are
# marginal sums, not observations, so they are removed first and the test is
# run on the table of raw counts only. If no "Total" margin is present the
# table is used unchanged.
airquality_counts <- airquality_level[
  rownames(airquality_level) != "Total",
  colnames(airquality_level) != "Total",
  drop = FALSE
]
chisq.test(airquality_counts, simulate.p.value = TRUE)
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  airquality_counts
## X-squared = 1363.2, df = NA, p-value = 0.0004998
# Critical value of the chi-squared test at the 5% significance level: H0 is
# rejected when X-squared exceeds the upper-tail 5% quantile.
# Degrees of freedom are (rows - 1) * (columns - 1) for the table of
# 30 cities by 6 air-quality levels, i.e. 145; "Total" margins are not counted.
qchisq(0.05, df = (30 - 1) * (6 - 1), lower.tail = FALSE)
## approximately 174.1 (re-run to print the exact value)

Chi-Squared Test Result

The chi-squared test gives a p-value of 0.0004998, which is less than the 0.05 significance level. We therefore reject the null hypothesis (H0): there is sufficient evidence to conclude that air quality level and city are not independent, i.e. there is a relationship between them.

Two-Sample Paired T-test

Null hypothesis (H0): The true mean difference between each of the 30 cities' average AQI (pm25) from February to April 2020 and its average over the same months of 2017-2019 is equal to zero.

Alternative hypothesis (H1): The true mean difference between each of the 30 cities' average AQI (pm25) from February to April 2020 and its average over the same months of 2017-2019 is not equal to zero.

# Per-city mean and standard deviation of pm25, computed after dropping rows
# that contain any missing value. Returns one row per city with the columns
# city, mean and sd.
summarise_pm25_by_city <- function(pm25_df) {
  pm25_df %>%
    drop_na() %>%
    group_by(city) %>%
    summarise(mean = mean(pm25), sd = sd(pm25))
}

# Join the two periods by city so that every row pairs one city's 2020
# statistics with the same city's 2017-2019 statistics. The paired vectors and
# the side-by-side tables below are all derived from this one table, so their
# rows are guaranteed to line up.
city_stats <- inner_join(
  summarise_pm25_by_city(pm25_2020),
  summarise_pm25_by_city(pm25_171819),
  by = "city",
  suffix = c("_2020", "_2017_2018_2019")
)

# Paired numeric vectors (one element per city) for the t-test.
mean_20 <- city_stats %>% pull(mean_2020)
mean_171819 <- city_stats %>% pull(mean_2017_2018_2019)

# table1/table3 keep the city column; table2/table4 omit it because they are
# column-bound to the right of table1/table3 for display.
table1 <- city_stats %>%
  select(city, mean_2020)

table2 <- city_stats %>%
  select(mean_2017_2018_2019)

table3 <- city_stats %>%
  select(city, standard_deviation_2020 = sd_2020)

table4 <- city_stats %>%
  select(standard_deviation_2017_2018_2019 = sd_2017_2018_2019)

T Test Table

Mean Table

# Render the per-city means side by side: 2020 next to 2017-2019.
table1 %>%
  bind_cols(table2) %>%
  knitr::kable()
city mean_2020 mean_2017_2018_2019
Beijing 101.08889 122.56180
Changchun 129.68182 130.83146
Changsha 119.76136 131.89139
Chengdu 120.07955 138.58052
Chongqing 118.37500 122.62547
Fuzhou 85.51136 101.99625
Guangzhou 79.71591 104.67416
Guiyang 104.68182 119.70037
Harbin 118.10227 128.25468
Hefei 99.15909 133.51311
Jinan 121.12500 153.87640
Kunming 93.87500 107.58801
Lanzhou 109.44318 127.83895
Lhasa 47.64773 70.38202
Nanchang 117.04545 122.91011
Nanjing 103.63529 131.98876
Nanning 89.17045 102.31461
Shanghai 96.43182 121.94382
Shenyang 124.36364 131.01124
Shenzhen 86.70455 108.02622
Shijiazhuang 132.56818 170.88015
Suzhou 112.28409 138.64419
Taiyuan 123.22727 141.02622
Tianjin 112.10227 131.56015
Wuhan 119.30682 148.14286
Wulumuqi 111.55682 153.50562
Xian 130.88636 156.21348
Xining 112.95455 125.71536
Yinchuan 95.00000 113.98876
Zhengzhou 125.65909 153.83895

Standard Deviation Table

# Render the per-city standard deviations side by side: 2020 next to 2017-2019.
table3 %>%
  bind_cols(table4) %>%
  knitr::kable()
city standard_deviation_2020 standard_deviation_2017_2018_2019
Beijing 53.622410 58.27449
Changchun 71.154930 41.89880
Changsha 31.946433 35.77939
Chengdu 31.925280 31.39782
Chongqing 27.455842 32.98602
Fuzhou 26.220547 31.28411
Guangzhou 30.560266 32.54474
Guiyang 25.447510 27.04869
Harbin 82.209578 51.92377
Hefei 26.941779 29.62705
Jinan 33.247872 35.76158
Kunming 25.200717 27.87814
Lanzhou 17.747823 25.46043
Lhasa 9.629982 20.77276
Nanchang 27.839404 33.82973
Nanjing 25.560780 29.50181
Nanning 33.432802 30.92044
Shanghai 43.508170 38.68478
Shenyang 43.895430 41.74101
Shenzhen 22.903676 26.22566
Shijiazhuang 45.105444 66.73903
Suzhou 29.967396 33.18705
Taiyuan 45.217159 45.75691
Tianjin 45.379888 55.96237
Wuhan 28.383806 30.04290
Wulumuqi 55.372689 76.66299
Xian 37.331516 49.79993
Xining 19.740215 29.23179
Yinchuan 30.217982 33.16596
Zhengzhou 34.516325 46.23868
# Paired t-test on the per-city means: element i of each vector must refer to
# the same city. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned.
t.test(mean_20, mean_171819, paired = TRUE)
## 
##  Paired t-test
## 
## data:  mean_20 and mean_171819
## t = -11.116, df = 29, p-value = 5.679e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -23.87243 -16.45294
## sample estimates:
## mean of the differences 
##               -20.16268
# Critical value for the two-sided test at the 5% significance level with
# 29 degrees of freedom: H0 is rejected when |t| exceeds the 97.5% quantile
# (2.5% in each tail).
qt(0.975, df = 29)
## [1] 2.04523

T Test Result

The paired t-test gives a p-value of 5.679e-12, which is less than the 0.05 significance level. We therefore reject the null hypothesis (H0): there is sufficient evidence to conclude that the true mean difference between each city's average AQI (pm25) from February to April 2020 and its average over the same months of 2017-2019 is not equal to zero. Across the 30 cities, the 2020 average is about 20 AQI points lower (mean difference -20.16, 95% confidence interval -23.87 to -16.45).