COVID: Symptom

Brief
Data
Preparation
Data clean
Population characteristics
Symptom profile
Symptom cluster
Syndrome region
Symptom region
Association analysis
R session info
Comments and feedbacks

Brief

In Dec 2022, COVID-19 restrictions in China were lifted, and a new wave of the pandemic emerged.
We conducted a survey to investigate the symptoms and their association with population characteristics(e.g., vaccination).
Published in Journal of Medical Virology. [paper]

Data

552 questionnaires were collected. Note: the data is in Chinese and it would be cleaned with the below code.
You can download the data at here.
Or, you may clone our repository, which contains our data and results:

git clone https://github.com/zhanghaoyang0/covid_survey.git
cd covid_survey

Preparation

Load packages:

libs = c('openxlsx', 'stringr', 'dplyr', 'stringi', 'R.utils', 'ggplot2', 'ggpubr', 'ggsci', 'mapchina', 'sf', 'data.table','NbClust', 'igraph', 'factoextra')
lapply(libs, require, character.only = TRUE)
options(stringsAsFactors=F)
sf::sf_use_s2(FALSE)

Define functions:

# calculate proportion of a given var for all, male, female participants
# e.g,
# temp <- data.frame(sex=c('Male', 'Female'), var=c(1, 0))
# get_prop(temp, 'var')
get_prop <- function(df, var){
    for (sex1 in c('Male|Female', 'Male', 'Female')){
      df1 <- df%>%filter(grepl(sex1, sex))
      tab <- table(df1[,var])
      frq <- data.frame(var=names(tab), n=as.numeric(tab))
      frq <- frq%>%mutate(n=paste0(n,'(', sprintf('%.2f', n*100/sum(tab)),'%)'))
      print(paste0('distribution of ', ifelse(sex1=='Male|Female', 'all', tolower(sex1)), ' participants in ', var, ':'))
      print(frq)
    }
}
# convert character vector to numeric
# e.g, df <- df%>%mutate_if(is_numeric,as.numeric)
is_numeric <- function(x) {
    !any(is.na(suppressWarnings(as.numeric(na.omit(x))))) & is.character(x)
}

Versions I used

Data clean

Load data, clean colnames:

df <- read.xlsx('data/covid_survey_20230112.xlsx', sheet=1) # questionaire
names(df) <- gsub('/|，|？|“|”', '', names(df)) # remove Chinese punctuations
names(df) <- gsub('?', '', names(df), fixed=T)
names(df) <- str_replace(names(df), '在感染后是否有出现以下[\U4E00-\U9FFF\U3000-\U303F]+症状:', '')
names(df) <- str_replace(names(df), '[（][\U4E00-\U9FFF\U3000-\U303F|1-9]+[）]', '')

Drop unused items and atypical symptoms:

drop_cols <- c('提交时间', '答题时间', '喉咙有刀割感', '吞咽时疼痛', '喉咙嘶哑', '喉咙干痒', '性欲减退', '生理期异常', '肾脏部位疼痛', '流泪', '打喷嚏')
df[,drop_cols] <- NULL # drop atypical symptoms
names(df)

Replace chinese with english:

dict1 <- read.xlsx('data/covid_survey_20230112.xlsx', sheet=2)
dict2 <- read.xlsx('data/covid_survey_20230112.xlsx', sheet=3)
dict <- rbind(dict1, dict2[,c(1,4)])
print('chinese items to english:')
print(head(dict))
for (i in 1:nrow(dict)){
    names(df)[names(df)==dict[i, 1]] = dict[i, 2]
}

Characteristics description

Age and sex:

df <- df%>%mutate(age=gsub('岁|以上', '', age))%>%
  mutate(age=ifelse(age%in%c('41-50', '51-60', '61-70', '75'), '>40', age))%>%
  mutate(age=ifelse(age%in%c('12-18',  '18-24', '6-12', '3-6'), '<24', age))%>%
  mutate(age=factor(age, levels=c('<24', '24-30', '31-40', '>40')))
df <- df%>%mutate(sex=factor(ifelse(sex=='女','Female', 'Male'), levels=c('Female', 'Male')))
table(df$sex)
get_prop(df, 'age')

Disease duration:

# assume interval is left closed and right open, modify to reduce confusing
df$infect_duration[df$infect_duration=='3～5天'] <- '3~4天'
df$infect_duration[df$infect_duration=='5~7天'] <- '5~6天'
df$infect_duration[df$infect_duration=='7~10天'] <- '7~9天'
df <- df%>%mutate(infect_duration=ifelse(infect_duration%in%c('7~9天', '10天以上'), '>7 day', infect_duration))%>%
  mutate(infect_duration=ifelse(infect_duration%in%c('', '小于3天'), '<3 day', infect_duration))%>%
  mutate(infect_duration=gsub('天', ' day', infect_duration))%>%
  mutate(infect_duration=gsub('~', '-', infect_duration))%>%
  mutate(infect_duration=factor(infect_duration, levels=c('<3 day', '3-4 day', '5-6 day', '>7 day')))
get_prop(df, 'infect_duration')

Fever duration:

df$fever_duration <- sapply(df$fever_duration, function(x){strsplit(x, '[(]')[[1]][1]})
df <- df%>%mutate(fever_duration=ifelse(is.na(fever_duration), 'no reply', fever_duration))%>%
  mutate(fever_duration=gsub('天', ' day', fever_duration))%>%
  mutate(fever_duration=ifelse(fever_duration%in%c('1 day', '<1 day'), '≤1 day', fever_duration))%>%
  mutate(fever_duration=factor(fever_duration, levels=c('no reply', '≤1 day', '2 day', '3 day', '>3 day')))
get_prop(df, 'fever_duration')

Infect route:

df <- df%>%mutate(
  infectway_entertainment=factor(as.numeric(grepl('消费场所', infect_way))),
  infectway_work=factor(as.numeric(grepl('工作场所', infect_way))),
  infectway_family=factor(as.numeric(grepl('在家被家人传染', infect_way))),
  infectway_traffic=factor(as.numeric(grepl('公共交通', infect_way))),
  infectway_hosp=factor(as.numeric(grepl('医疗场所', infect_way))))
for (i in c('infectway_entertainment', 'infectway_work', 'infectway_family', 'infectway_traffic', 'infectway_hosp')){
  print(i)
  get_prop(df, i)
}

Vaccination:

df[df$how_long_lastvac=='', 'n_vac'] <- '0' # if a person report n_vac but not how_long_lastvac, treat n_vac as NA
df <- df%>%mutate(n_vac=ifelse(n_vac%in%c(3, 4), '≥3', n_vac))%>%
  mutate(n_vac=factor(n_vac, levels=c('0', '1', '2', '≥3')))
df <- df%>%mutate(how_long_lastvac=ifelse(how_long_lastvac=='', 'no_vac', how_long_lastvac))%>%
  mutate(how_long_lastvac=gsub('个月', ' month', how_long_lastvac))%>%
  mutate(how_long_lastvac=ifelse(how_long_lastvac%in%c('<3 month', '3-6 month'), '<6 month', how_long_lastvac))%>%
  mutate(how_long_lastvac=factor(how_long_lastvac, levels=c('no_vac', '<6 month', '6-12 month', '>12 month')))
get_prop(df, 'n_vac')
get_prop(df, 'how_long_lastvac')

Medication:

drugs = c('ibuprofen_use', 'acetaminophen_use', 'chnmed_use')
df[, drugs][is.na(df[, drugs])] = 0
for (drug in drugs){
   print(drug)
   get_prop(df, drug)
   df[,drug] = as.factor(df[,drug])
}

Syndrome:

# calculate syndrome score with symptom score, normalize to 0-1:
syndromes <- unique(dict2$syndrome)
symptoms = list()
for (i in syndromes){
  symptoms[[i]] = dict2%>%filter(syndrome==i)%>%pull(item_eng)
}
print(symptoms)
print(sum(is.na(df[,unlist(symptoms)]))) # number of NA
for (syndrome in syndromes){
  score = rowSums(df[,symptoms[[syndrome]]])/length(symptoms[[syndrome]])/3 # normalize to 0-1
  df[,paste0(syndrome, '_score')] = score
}

Region:

df$region <- gsub("维吾尔|壮族", "", df$region)
regions <- c()
for (i in 1:nrow(df)) {
  item <- df[i, "region"]
  item1 <- strsplit(item, "自治区|自治州|特别行政区|省|市")[[1]][1]
  regions <- c(regions, item1)
}
df$region <- regions
# replace chn with pinyin
data(china)
china <- china %>% mutate(region = gsub("省|市|回族|壮族|维吾尔|特别行政区|自治区", "", Name_Province))
china <- china %>%
  group_by(region) %>%
  dplyr::summarise(geometry = st_union(geometry))
replace <- data.frame(
  region = china$region,
  region1 = c(
    "Shanghai", "Yunnan", "Neimenggu", "Beijing", "Taiwan", "Jilin", "Sichuan", "Tianji",
    "Ningxia", "Anhui", "Shandong", "Shanxi", "Guangdong", "Guangxi", "Xinjiang", "Jiangsu", "Jiangxi", "Hebei",
    "Henan", "Zhejiang", "Hainan", "Hubei", "Hunan", "Macau", "Gansu", "Fujian", "Tibet", "Guizhou", "Liaoning",
    "Chongqing", "Shaanxi", "Qinghai", "Hong Kong", "Heilongjiang"
  )
)
china <- china %>%
  merge(replace, "region") %>%
  select(-region) %>%
  rename(region = region1)
df <- df %>%
  merge(replace, "region") %>%
  select(-region) %>%
  rename(region = region1)
print(table(df$region))
# sample size for each region
tab <- table(df$region)
tab <- data.frame(cbind(names(tab), tab))
pop_tab <- tab %>%
  rename(n = tab, region = V1) %>%
  mutate(n = as.numeric(n)) %>%
  arrange(n)
print(pop_tab)

Symptom profile

map <- data.frame(score = c(0:3), score1 = c("Absent", "Mild", "Moderate", "Severe"))
out <- c()
for (i in unlist(symptoms)) {
  for (j in c(0:3)) {
    prop <- sum(df[, i] == j) / nrow(df)
    out <- c(out, i, j, prop)
  }
}
df_p <- data.frame(matrix(out, ncol = 3, byrow = T))
df_p <- df_p %>%
  mutate_if(is_numeric, as.numeric) %>%
  rename(symptom = X1, score = X2, prop = X3)
df_p <- df_p %>%
  merge(map, "score") %>%
  mutate(score1 = factor(score1, levels = c("Severe", "Moderate", "Mild", "Absent")))
df_p %>%
  merge(dict2, by.x = "symptom", by.y = "item_eng") %>%
  select(syndrome, symptom, score1, prop) # add syndrome
plots <- list()
for (syndrome in syndromes) {
  df_p1 <- df_p %>% filter(symptom %in% symptoms[[syndrome]])
  df_p1 <- df_p1 %>% mutate(symptom = gsub(paste0(syndrome, "_"), "", symptom))
  xlevels <- df_p1 %>%
    filter(score1 == "Absent") %>%
    arrange(prop) %>%
    pull(symptom)
  df_p1$symptom <- factor(df_p1$symptom, levels = xlevels)
  df_p1 <- df_p1 %>%
    rename(item_eng = symptom) %>%
    merge(dict2, "item_eng") %>%
    rename(symptom = item_eng1) # replace symptoms with their formal names
  p <- ggplot(df_p1, aes(x = symptom, weight = prop, fill = score1)) +
    geom_bar(position = "stack") +
    xlab("") +
    ylab("") +
    labs(fill = "Severity") +
    theme(
      axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, color = "black"),
      axis.text.y = element_text(color = "black"), legend.position = "none"
    ) +
    ggtitle(syndrome) +
    theme(plot.title = element_text(size = 15, face = "bold", hjust = 0.5)) +
    coord_flip() +
    scale_fill_nejm()
  plots[[syndrome]] <- p
}
# multiple plot
p1 <- ggarrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]],
  ncol = 2, nrow = 2, common.legend = T, legend = "bottom",
  hjust = 0.1, vjust = 0.1
)
file_out <- "./plot/bar.png"
png(file_out, height = 1000, width = 1000, res = 160)
print(p1)
dev.off()

Symptom cluster

df_p = t(df[, unlist(symptoms)])
# repalce symptoms with their formal names
for (i in 1:nrow(dict2)){
   row.names(df_p)[row.names(df_p)==dict2[i, 'item_eng']] = dict2[i, 'item_eng1']
}
d = dist(df_p)
fit = hclust(d, method = "average")
png(paste0('hclust.png'), width=2500, height=1500, res=300)
fviz_dend(fit, k=3, rect =F, rect_fill = T, palette='aaas', cex = 0.6,
   type = c("rectangle"), # type = c("rectangle", "circular", "phylogenic")
   main = '', ylab = "Dendrogram height", horiz = T)  # ggsci color
dev.off()

Syndrome regional distribution

# mean score
res = data.frame()
plots = list()
for (group in syndromes){
   print(group)
   temp = df[, c('region', paste0(group, '_score'))]
   names(temp)[2] = 'score'
   temp = temp%>%group_by(region)%>%dplyr::summarise(score=mean(score))
   temp = temp%>%merge(pop_tab, 'region')%>%filter(n>=10)%>%arrange(desc(score))
   sub = data.frame(temp)%>%mutate(group=group)
   res = rbind(res, sub)
   map1 = china%>%merge(temp, by='region', all.x=T)%>%mutate(region=ifelse(is.na(score), NA, region)) # add to map
   p = ggplot(data = map1) +
      geom_sf(aes(fill = score)) +
      geom_sf_text(aes(label = region), colour = "black") +
      scale_fill_distiller(palette = "Spectral") +
      labs(fill = 'Score') +
      ggtitle(group) +
      theme(plot.title = element_text(size = 35, face = "bold", hjust=0.07, vjust=-9),
            plot.background = element_blank(), panel.border = element_blank(),
            axis.text.x=element_blank(), axis.ticks.x=element_blank(),
            axis.text.y=element_blank(), axis.ticks.y=element_blank(),
            legend.key.height= unit(1.5, 'cm'), legend.key.width= unit(1.5, 'cm'),
            legend.title = element_text(size=20), legend.text = element_text(size=15),
            panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank()) +
      labs(x = "", y = '') +
      coord_sf(xlim = c(73, 135), ylim = c(18, 54), expand = T)
   plots[[group]] = p
}
p1 = ggarrange(plots[[1]], plots[[2]], plots[[3]], plots[[4]], ncol=2, nrow=2,
   common.legend=T, legend="right")
png('./plot/map.png', height=1300, width=1700, res=80)
print(p1)
dev.off()
res # average score

Symptom regional distribution

res = data.frame()
provs =  names(rev(sort(table(df$region)))) # sort by n
for (prov in provs){
   sub = df%>%filter(region==prov)
   temp = colMeans(sub[,unlist(symptoms)])/3
   add = data.frame(region=prov, symptom=names(temp), score=temp, n=nrow(sub))
   res = rbind(res, add)
}
# replace symptom names with their formal name
res = res%>%merge(dict2%>%select(item_eng, item_eng1), by.x='symptom', by.y='item_eng')%>%select(-symptom)%>%rename(symptom=item_eng1)
# filter region with less than 10 samples
res = res%>%filter(n>=10)
p = res%>%
   ggplot(aes(region, symptom, fill=score)) +
   geom_tile() +
   labs(x = NULL, y = NULL, fill = "Score", title="", subtitle="") +
   scale_fill_gradient2(limits=c(0,1)) +
   theme_classic() +
   theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1, color="black"),
      axis.text.y = element_text(color="black"),
      legend.title = element_text(size=12)) +
   scale_fill_distiller(palette = "Spectral")

png('./plot/heatmap.png', height=1000, width=800, res=150)
print(p)
dev.off()

Association analysis

# we use linear regression to test if syndrome score varired in different population
# we use logistic regression to test if symptom varired in different population
# in each regression, we add age and sex as covariate
covars <- c("age", "sex")
test_vars <- c(
  "agesex", "how_long_lastvac", "n_vac",
  "infectway_work", "infectway_family", "infectway_traffic", "infectway_hosp",
  "ibuprofen_use", "acetaminophen_use", "chnmed_use", "lianhua_use"
) # the first mean we only include age and sex in reg
outcomes <- c(paste0(c("Respiratory", "Neurological", "Digestive", "Other"), "_score"), unlist(symptoms))
out <- c()
for (test_var in test_vars) {
  if (test_var == "agesex") {
    formula <- formula("y~age+sex")
  } else {
    formula <- formula(paste0("y~age+sex+", test_var))
  }
  # when var is how_long_lastvac, remove samples without vac
  if (test_var == "how_long_lastvac") {
    df1 <- droplevels(df %>% filter(how_long_lastvac != "no_vac")) # drop level
  } else {
    df1 <- df
  }
  for (outcome in outcomes) {
    print(outcome)
    # regression
    df1$y <- df1[, outcome]
    link <- "gaussian" # for linear reg
    if (!grepl("_score", outcome)) {
      df1$y <- ifelse(df1$y == 0, 0, 1) # binary outcome, logistic reg
      link <- "binomial" # for logistic reg
    }
    reg <- glm(formula, df1, family = link)
    coef <- data.frame(summary(reg)$coefficients) %>% tibble::rownames_to_column("var")
    coef <- coef[2:nrow(coef), c(1, 2, 3, 5)]
    names(coef) <- c("var", "beta", "se", "p")
    # calculate distribution in sub population, n(%) or mean±sd
    reg_vars <- strsplit(as.character(formula)[3], " + ", fixed = T)[[1]]
    if (test_var != "agesex") {
      reg_vars <- reg_vars[!reg_vars %in% c("age", "sex")]
    } # we dont show coef of age and sex when testing other var
    for (reg_var in reg_vars) {
      for (group in levels(df1[, reg_var])) {
        temp <- df1 %>%
          filter(df1[, reg_var] == group) %>%
          pull(y)
        coef1 <- coef[coef$var == paste0(reg_var, group), c("beta", "se", "p")]
        if (nrow(coef1) == 0) {
          coef1 <- c("NA", "NA", "NA")
        }
        if (group == levels(df1[, reg_var])[1]) {
          coef1 <- c("Ref.", "NA", "NA")
        }
        d1 <- d2 <- d3 <- NA
        if (!grepl("_score", outcome)) {
          d1 <- paste0(sum(temp == 1), " (", round(sum(temp == 1) / length(temp) * 100, 2), "%)")
          d2 <- paste0(sum(temp == 0), " (", round(sum(temp == 0) / length(temp) * 100, 2), "%)")
        } else {
          d3 <- paste0(sprintf("%.2f", mean(temp)), "±", sprintf("%.2f", sd(temp)))
        }
        # formal name
        if (grepl("score", outcome)) {
          outcome1 <- gsub("_", " ", outcome)
        } else {
          outcome1 <- dict2 %>%
            filter(item_eng == outcome) %>%
            pull(item_eng1)
        }
        out <- c(out, outcome1, test_var, reg_var, group, d1, d2, d3, unlist(coef1))
      }
    }
  }
}
res <- data.frame(matrix(out, ncol = 10, byrow = T))
names(res) <- c("outcome", "test_var", "reg_var", "level", "ncase", "nctrl", "mean", "beta", "se", "p")
# extract significant result
res1 <- data.frame()
for (test_var1 in test_vars) {
  for (outcome1 in unique(res$outcome)) {
    sub <- res %>% filter(test_var == test_var1 & outcome == outcome1)
    if (!any(as.numeric(sub %>% filter(p != "NA") %>% pull(p)) < 0.05)) {
      next
    }
    res1 <- rbind(res1, sub)
  }
}
res1

R session info

R (and packages) versions I used (not necessary to be the same, but you may check if you have error).

# R version I used (not necessary to be the same, but check if you have error)
session = c(unname(unlist(R.version['version.string'])))
for (lib in libs) {session = c(session, sprintf('%s: %s', lib, packageVersion(lib)))}
paste(session, collapse='; ')
# "R version 4.2.3 (2023-03-15); openxlsx: 4.2.5.2; stringr: 1.5.1; dplyr: 1.1.4; stringi: 1.7.12; R.utils: 2.12.3; ggplot2: 3.4.4; ggpubr: 0.6.0; ggsci: 3.0.0; mapchina: 0.1.0; sf: 1.0.12; data.table: 1.14.8; NbClust: 3.0.1; igraph: 1.4.2; factoextra: 1.0.7"

Comments and feedbacks

Find me via [email protected]

Haoyang Zhang

Histogram