# Show the source of every chunk in the knitted output by default.
knitr::opts_chunk$set(echo = TRUE)

library(magrittr)
library(dplyr)
library(pivottabler)
library(tidyverse)
library(ggridges)
library(modelr)
library(mgcv)
# Default figure geometry for every knitr chunk.
knitr::opts_chunk$set(fig.width = 6, fig.asp = .6, out.width = "90%")

# Minimal ggplot2 theme with the legend placed under the plot.
theme_set(theme_minimal() + theme(legend.position = "bottom"))

# Viridis palettes for continuous colour and fill scales ...
options(
  ggplot2.continuous.colour = "viridis",
  ggplot2.continuous.fill = "viridis"
)

# ... and for discrete ones, by rebinding the default discrete scale names
# to their viridis counterparts.
scale_colour_discrete <- scale_colour_viridis_d
scale_fill_discrete <- scale_fill_viridis_d

# Read every file found in "30_cities_data" and stack the contents into one
# long table holding the city label, the observation date and the pm25 value.
city_30_df_pm25 <-
  tibble(file = list.files("30_cities_data")) %>%
  mutate(
    # City label = file name without its "-air-quality.csv" suffix, in
    # title case.
    city = str_to_title(str_remove(file, "-air-quality.csv")),
    # One parsed table per file, stored as a list-column until unnest().
    data = map(str_c("30_cities_data/", file), read_csv)
  ) %>%
  unnest(data) %>%
  mutate(date = as.Date(date, format = "%Y/%m/%d")) %>%
  select(city, date, pm25)

# Keep the February-to-April readings of a single calendar year.
#
# df:   table with `city`, `date` (Date) and `pm25` columns.
# year: four-digit calendar year, e.g. 2020.
#
# Returns the rows dated strictly after 31 January and strictly before 1 May
# of `year`, with `date` reformatted as a "yy-mm-dd" character string and the
# columns restricted to city, date and pm25.
subset_feb_to_apr <- function(df, year) {
  # Explicit Date bounds instead of comparing a Date column to bare strings.
  window_open <- as.Date(paste0(year, "-01-31"))
  window_close <- as.Date(paste0(year, "-05-01"))

  df %>%
    filter(date > window_open & date < window_close) %>%
    mutate(date = format(date, format = "%y-%m-%d")) %>%
    select(city, date, pm25)
}

pm25_2020 <- subset_feb_to_apr(city_30_df_pm25, 2020)
pm25_2019 <- subset_feb_to_apr(city_30_df_pm25, 2019)
pm25_2018 <- subset_feb_to_apr(city_30_df_pm25, 2018)
pm25_2017 <- subset_feb_to_apr(city_30_df_pm25, 2017)

# Pooled baseline period (2017-2019) and the 2019-vs-2020 comparison set.
pm25_171819 <- rbind(pm25_2017, pm25_2018, pm25_2019)

pm25_1920 <- rbind(pm25_2020, pm25_2019)

Chi-Squared Test

Are city and air quality level dependent at the 5% level of significance? In other words, given the data collected above, is there a relationship between the cities and the air quality levels recorded in them?

Null hypothesis (H0): in the contingency table for February to April 2020, air quality level and city are independent.

Alternative hypothesis (H1): in the contingency table for February to April 2020, air quality level and city are dependent.

# Classify every February-April 2020 pm25 reading into an air-quality level.
# case_when() takes the first condition that matches, so each `<=` acts as the
# upper bound of its band; readings above 500 match nothing and stay NA.
city_PM25 <- pm25_2020 %>%
  drop_na() %>%
  select(city, pm25) %>%
  mutate(
    level = case_when(
      pm25 <= 50 ~ "Good",
      pm25 <= 100 ~ "Moderate",
      pm25 <= 150 ~ "Unhealthy for Sensitive People",
      # No leading space in the label, so the resulting column name is
      # consistent with the other levels.
      pm25 <= 200 ~ "Unhealthy",
      pm25 <= 300 ~ "Very Unhealthy",
      pm25 <= 500 ~ "Hazardous"
    )
  ) %>%
  arrange(city, level, pm25)

# Contingency table: one row per city, one column per level, each cell the
# number of readings. count() returns an ungrouped result, and values_fill
# puts 0 where a city never recorded a given level.
city_level <- city_PM25 %>%
  count(city, level) %>%
  pivot_wider(names_from = "level", values_from = "n", values_fill = 0L)

Air Quality Level Table

# Load the saved city-by-level count table, using the first CSV column as the
# row names, and render it.
airquality_level <- read.csv(
  file = "test (Fei)/city_air_quality_level.csv",
  row.names = 1
)
knitr::kable(airquality_level)

	Good	Moderate	Unhealthy.for.Sensitive.People	Unhealthy	Very.Unhealthy	Hazardous	Total
Beijing	19	31	23	14	3	0	90
Changchun	2	29	37	8	7	5	88
Changsha	2	17	54	15	0	0	88
Chengdu	1	22	47	18	0	0	88
Chongqing	0	20	53	15	0	0	88
Fuzhou	9	56	23	0	0	0	88
Guangzhou	17	52	16	3	0	0	88
Guiyang	0	42	43	3	0	0	88
Harbin	7	36	25	12	6	1	87
Hefei	3	45	36	4	0	0	88
Jinan	1	21	47	19	0	0	88
Kunming	1	50	36	1	0	0	88
Lanzhou	0	26	62	0	0	0	88
Lhasa	56	32	0	0	0	0	88
Nanchang	0	28	50	10	0	0	88
Nanjing	0	42	39	4	0	0	85
Nanning	10	52	22	4	0	0	88
Shanghai	5	49	29	3	1	1	88
Shenyang	1	28	33	24	2	0	88
Shenzhen	2	67	17	2	0	0	88
Shijiazhuang	0	21	37	24	6	0	88
Suzhou	0	33	43	12	0	0	88
Taiyuan	2	29	32	20	5	0	88
Tianjin	9	29	27	21	2	0	88
Wuhan	1	20	55	12	0	0	88
Wulumuqi	3	46	22	11	4	2	88
Xian	0	19	44	19	6	0	88
Xining	0	28	58	2	0	0	88
Yinchuan	0	57	24	7	0	0	88
Zhengzhou	2	18	47	19	2	0	88
Total	153	1045	1081	306	44	9	2638

# Chi-squared test of independence between city (rows) and level (columns).
# The "Total" margin row and column shown in the table above are sums, not
# observed cells, so they are excluded before testing.
# NOTE: the printed result that follows in this document was produced with
# the margins included; re-knit to refresh it.
chisq.test(
  airquality_level[
    rownames(airquality_level) != "Total",
    colnames(airquality_level) != "Total",
    drop = FALSE
  ],
  simulate.p.value = TRUE
)

## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  airquality_level
## X-squared = 1363.2, df = NA, p-value = 0.0004998

# Critical value of the chi-squared test at the 5% level. The rejection
# region is the UPPER tail, and a table of 30 cities by 6 levels has
# (30 - 1) * (6 - 1) = 145 degrees of freedom (the Total column is not a
# level). The value is approximately 174.1.
# NOTE: the printed value that follows in this document predates this
# correction; re-knit to refresh it.
qchisq(0.05, df = (30 - 1) * (6 - 1), lower.tail = FALSE)

## [1] 144.494

Chi-Squared Test Result

For the chi-squared test, the p-value (0.0004998) is less than the 0.05 significance level. We therefore reject the null hypothesis (H0): there is sufficient evidence to conclude that air quality level and city are not independent, i.e. there is a relationship between them.

Two-Sample Paired T-test

Null hypothesis (H0): The true mean difference between the average AQI (pm25) of each of the 30 cities from February to April 2020 and its average over 2017-2019 is equal to zero.

Alternative hypothesis (H1): The true mean difference between the average AQI (pm25) of each of the 30 cities from February to April 2020 and its average over 2017-2019 is not equal to zero.

# Per-city mean and standard deviation of pm25 for one period.
#
# df: table with `city`, `date` and `pm25` columns.
#
# Returns one row per city (in group_by() order) with columns `city`,
# `pm25_mean` and `pm25_sd`. Rows containing any NA are dropped first.
summarise_pm25_by_city <- function(df) {
  df %>%
    drop_na() %>%
    group_by(city) %>%
    summarise(pm25_mean = mean(pm25), pm25_sd = sd(pm25))
}

stats_2020 <- summarise_pm25_by_city(pm25_2020)
stats_171819 <- summarise_pm25_by_city(pm25_171819)

# The paired t-test and the side-by-side tables match the two periods by row
# position, so both summaries must list the same cities in the same order.
stopifnot(identical(stats_2020$city, stats_171819$city))

# Plain numeric vectors of per-city means, used for the paired t-test.
mean_20 <- stats_2020$pm25_mean
mean_171819 <- stats_171819$pm25_mean

# Display tables: table1/table3 carry the city column, table2/table4 hold
# only the value column so that bind_cols() does not duplicate `city`.
table1 <- stats_2020 %>%
  select(city, mean_2020 = pm25_mean)

table2 <- stats_171819 %>%
  select(mean_2017_2018_2019 = pm25_mean)

table3 <- stats_2020 %>%
  select(city, standard_deviation_2020 = pm25_sd)

table4 <- stats_171819 %>%
  select(standard_deviation_2017_2018_2019 = pm25_sd)

T Test Table

Mean Table

# Per-city means of both periods, side by side.
table1 %>%
  bind_cols(table2) %>%
  knitr::kable()

city	mean_2020	mean_2017_2018_2019
Beijing	101.08889	122.56180
Changchun	129.68182	130.83146
Changsha	119.76136	131.89139
Chengdu	120.07955	138.58052
Chongqing	118.37500	122.62547
Fuzhou	85.51136	101.99625
Guangzhou	79.71591	104.67416
Guiyang	104.68182	119.70037
Harbin	118.10227	128.25468
Hefei	99.15909	133.51311
Jinan	121.12500	153.87640
Kunming	93.87500	107.58801
Lanzhou	109.44318	127.83895
Lhasa	47.64773	70.38202
Nanchang	117.04545	122.91011
Nanjing	103.63529	131.98876
Nanning	89.17045	102.31461
Shanghai	96.43182	121.94382
Shenyang	124.36364	131.01124
Shenzhen	86.70455	108.02622
Shijiazhuang	132.56818	170.88015
Suzhou	112.28409	138.64419
Taiyuan	123.22727	141.02622
Tianjin	112.10227	131.56015
Wuhan	119.30682	148.14286
Wulumuqi	111.55682	153.50562
Xian	130.88636	156.21348
Xining	112.95455	125.71536
Yinchuan	95.00000	113.98876
Zhengzhou	125.65909	153.83895

Standard Deviation Table

# Per-city standard deviations of both periods, side by side.
table3 %>%
  bind_cols(table4) %>%
  knitr::kable()

city	standard_deviation_2020	standard_deviation_2017_2018_2019
Beijing	53.622410	58.27449
Changchun	71.154930	41.89880
Changsha	31.946433	35.77939
Chengdu	31.925280	31.39782
Chongqing	27.455842	32.98602
Fuzhou	26.220547	31.28411
Guangzhou	30.560266	32.54474
Guiyang	25.447510	27.04869
Harbin	82.209578	51.92377
Hefei	26.941779	29.62705
Jinan	33.247872	35.76158
Kunming	25.200717	27.87814
Lanzhou	17.747823	25.46043
Lhasa	9.629982	20.77276
Nanchang	27.839404	33.82973
Nanjing	25.560780	29.50181
Nanning	33.432802	30.92044
Shanghai	43.508170	38.68478
Shenyang	43.895430	41.74101
Shenzhen	22.903676	26.22566
Shijiazhuang	45.105444	66.73903
Suzhou	29.967396	33.18705
Taiyuan	45.217159	45.75691
Tianjin	45.379888	55.96237
Wuhan	28.383806	30.04290
Wulumuqi	55.372689	76.66299
Xian	37.331516	49.79993
Xining	19.740215	29.23179
Yinchuan	30.217982	33.16596
Zhengzhou	34.516325	46.23868

# Paired t-test on the per-city means: element i of each vector belongs to
# the same city. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned.
t.test(mean_20, mean_171819, paired = TRUE)

## 
##  Paired t-test
## 
## data:  mean_20 and mean_171819
## t = -11.116, df = 29, p-value = 5.679e-12
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -23.87243 -16.45294
## sample estimates:
## mean of the differences 
##               -20.16268

# Lower critical value of the two-sided test at the 5% level with 29 degrees
# of freedom: the 5% is split across both tails, so the quantile is taken at
# 0.05 / 2. H0 is rejected when t falls below this value or above its
# negation.
qt(0.05 / 2, df = 29)

## [1] -2.04523

T Test Result

For the two-sample paired t-test, the p-value (5.679e-12) is less than the 0.05 significance level. We therefore reject the null hypothesis (H0): there is sufficient evidence to conclude that the true mean difference between the average AQI (pm25) of each of the 30 cities from February to April 2020 and its average over 2017-2019 is not equal to zero, i.e. the city means differ between 2020 and 2017-2019.

Hypothesis Test