Jinqiao cohort
Table of Contents
Brief
In 2022, Jiangnan University initialized a children cohort in China, Wuxi, namely Jinqiao cohort.We investigated potential novel association between MAFLD and certain lesser-explored biomarkers.
Here is the analysis pipeline.
Data
Contact me ([email protected]) or Dr. Le Zhang ([email protected]) to access the data.Data clean
Load packages and define functions:libs <- c("dplyr", "tidyr", "ggplot2", "ggpubr", "ggsci", "psych", "ggcorrplot", "corrplot", "pROC", "plotROC") lapply(libs, require, character.only = TRUE) options(stringsAsFactors = F) ## extract element with pattern from vector # e.g., get('a', c('AS', 'Ba', 'C')) ; get('a', c('AS', 'Ba', 'C'), exact=F) get <- function(key, vec, exact = F, v = F) { vec <- unlist(vec) if (exact == T) { out <- vec[grep(key, vec, invert = v)] } else { out <- vec[grep(toupper(key), toupper(vec), invert = v)] } return(out) } # calculate proportion of a given var for all, male, female participants # e.g, # temp = data.frame(sex=c('Male', 'Female'), MAFLD=c(1, 0)) # get_prop(temp, 'sex', 'MAFLD') get_prop <- function(df, col, group_col) { for (group in c("0|1", "1", "0")) { sub <- df[grepl(group, df[, group_col]), ] tab <- table(sub[, col]) frq <- data.frame(col = names(tab), n = as.numeric(tab)) frq <- frq %>% mutate(n = paste0(n, "(", sprintf("%.2f", n * 100 / sum(tab)), "%)")) group1 <- case_when(group == "0|1" ~ "all", group == "1" ~ group_col, group == "0" ~ paste0("non-", group_col)) print(paste0("distribution of ", group1, " participants :")) print(frq) } }Clean the data (remove redundancy variables, and recodes the variables, etc):
# load df load("path_of_data") # inbody, drop right df <- df[, !grepl("FFM%|.Right", names(df))] names(df) <- gsub(".", "_", names(df), fixed = T) names(df) <- gsub("_of_", "_", names(df), fixed = T) names(df) <- gsub("Left_", "", names(df), fixed = T) names(df) <- gsub("Circumference", "C", names(df), fixed = T) names(df) <- gsub("Muscle_C", "MC", names(df)) names(df) = gsub("Fat_Thickness", "FT", names(df)) df = df %>% mutate(BMI = as.numeric(weight) / ((as.numeric(height) / 100)^2), sex = ifelse(sex == "男", "m", "f"), Month = round(age * 12)) %>% rename(Age = age, Sex = sex) ## recode # bmi, orinal values according to who criteria vars <- names(df)[which(names(df) == "SBP"):which(names(df) == "PDW")] # all vars cat_vars <- c('Sex', "ERY", "URO", "PRO", "LEU", "VC", "UCA") # categorical num_vars <- c('Age', vars[!vars %in% cat_vars]) # numeric diagnosis_vars = c('BMI', 'SBP', 'DBP', 'GLU', 'TG', 'HDLC', 'C_Abdomen') # biomarkers used in diagnosed ## recode # cat_vars, convert to orinal values for (col in c("ERY", "URO", "PRO", "LEU", "VC", "UCA")) { print(col) cat("before recode:\n") print(table(df[, col])) var = df[, col] if (col == "UCA") { var1 = case_when(var == "<1.00" ~ 0, var == "2.5" ~ 1, var == "5" ~ 2) } else if (col == "URO") { var1 = case_when(var == "阴性" ~ 0, var == "阳性+" ~ 1, var == "阳性++" ~ 2) } else { var1 = case_when(var == "阴性" ~ 0, var == "弱阳性" ~ 1, var == "阳性+" ~ 2, var == "阳性++" ~ 3, var == "阳性+++" ~ 4) } df[, col] = var1 cat("after recode:\n") print(table(df[, col])) }
Characteristics description
Here we measure the distribution of categorical biomarkers by proportion and that of numeric ones by mean and sd.Distribution comparison was conducted according to data type.
## distribution description # categorical biomarkers, n and proportion for (col in c("Sex", "BMI", cat_vars)) { print(col) get_prop(df, col, group_col = "MAFLD") } # numeric biomarkers, mean, sd, iqr describe(df[, num_vars]) describeBy(df[, num_vars], list(df$MAFLD)) ## distribution comparison # fisher test for (var in cat_vars) { print(var) print(fisher.test(table(df[, var], df$MAFLD), simulate.p.value = TRUE)) # here change to fisher test } # wilcox test for (var in num_vars) { print(var) shapiro = shapiro.test(df[, var]) if (shapiro$p.value < 0.05) { test = wilcox.test(df[, var] ~ df$MAFLD) } else { test = t.test(df[, var] ~ df$MAFLD) } print(test) }
Alternative biomarkers
Here we measured the association between diagnosis biomarker and other biomarker.If a biomarker is associated with diagnosis biomarker, it is an alternative biomarker but not a novel biomarker.
out = c() for (i in diagnosis_vars){ for (j in vars[vars!=i]){ test = cor.test(df[,i], df[,j], use = "complete.obs") cor = test$estimate cor_p = test$p.value out = c(out, i, j, cor, cor_p) } } res = data.frame(matrix(out, ncol=4, byrow=T)) names(res) = c('diagnosis_var', 'other_var', 'cor', 'p') res = res%>%mutate(cor=as.numeric(cor), p=as.numeric(p)) res1 = res%>%filter(p<0.05&abs(cor)>0.8) plots <- list() for (var in unique(res1$diagnosis_var)){ df_p = res%>%filter(diagnosis_var==!!var)%>%arrange(cor)%>%filter(p<0.05&cor>0.2)%>% mutate(cor=abs(cor), diagnosis_var=gsub('_', ' of ', diagnosis_var), other_var=gsub('_', ' of ', other_var)) df_p = df_p%>%mutate(other_var=factor(other_var, levels=df_p$other_var)) ylab = paste0('Correlation coefficient with ', ifelse(var=='BMI', 'BMI', 'abdomen circumference')) p = ggplot(df_p, aes(x=other_var, y=cor))+ geom_bar(stat="identity", width=0.7, fill="steelblue") + xlab('') + ylab(ylab) + theme(plot.title = element_text(size = 15, face = "bold", hjust = 0.5)) + coord_flip() + geom_hline(yintercept = 0.8, color = "black", linetype = 2) + scale_fill_manual(values = c("steelblue", "green"), breaks = c(0, 1), labels = c("Correlation < 0.8", "Correlation >= 0.8")) + scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, 0.2)) plots[[var]] = p } png("plot/cor1.png", height = 800, width = 1200, res = 100) ggarrange(plots[[1]], plots[[2]], nrow = 1, ncol = 2, hjust = 0.1, vjust = 0.1) dev.off() # diagnosis_vars and vars associated with diagnosis_vars drop_vars = unlist(res1[,1:2])After excluding the above biomarkers (associated with diagnosis biomarker),
we investigate pair-wise correlation witnin remainings:
vars = list() vars[['inbody']] = c(names(df)[which(names(df) == "BFM"):which(names(df) == "FT_Thigh")]) # 31 vars[['blood_biochemical']] = c(names(df)[which(names(df) == "INS"):which(names(df) == "UREA/CREA")]) # 27 vars[['blood_composition']] = c(names(df)[which(names(df) == "WBC"):which(names(df) == "PDW")]) # 22 vars[['urine']] = c("URBC", "UWBC", "UPRO", "UPCR", "UCREA", "SG", "PH", "EC", "MUCS") # 9 plots <- list() for (i in names(vars)) { keep_col <- vars[[i]] keep_col = keep_col[!keep_col%in%drop_vars] print(i) print(keep_col) sub <- df[, keep_col] mat_cor <- cor(sub) mat_p <- corr.test(sub, adjust = "none")[["p"]] p <- ggcorrplot(mat_cor, p.mat = mat_p, type = "lower", hc.order = T, insig = "blank", outline.col = "white", ggtheme = ggplot2::theme_gray) + theme(axis.text.x = element_text(angle = 90, hjust = 1), legend.position='none') plots[[i]] <- p } png("plot/cor_inbody.png", height = 800, width = 1800, res = 100) ggarrange(plots[[1]], nrow = 1, ncol = 1, hjust = 0.1, vjust = 0.1, common.legend = T, legend = "bottom") dev.off() png("plot/cor2.png", height = 800, width = 1800, res = 100) ggarrange(plots[[2]], plots[[3]], plots[[4]], nrow = 1, ncol = 3, hjust = 0.1, vjust = 0.1, common.legend = T, legend = "bottom" ) dev.off()
Novel biomarkers
Here we identify novel biomarkers with remainnings in previous step.First, for each biomarker, we measure its association with MAFLD, including age and sex as covariates.
Then, for those with significant p-value in univariate analysis, we perform multivariates analysis for variable selection.
biomakers = unlist(vars) biomakers = biomakers[!biomakers%in%c(drop_vars, diagnosis_vars)] res <- data.frame() for (biomaker in biomakers) { reg <- glm(df$MAFLD ~ df[, biomaker] + df$Age + df$Sex, df, family = binomial()) # I add age and sex here. coef <- data.frame(summary(reg)$coefficients) coef <- coef[2, c(1, 2, 4)] coef <- c(biomaker, coef) names(coef) <- c("biomarker", "beta", "se", "p") res <- rbind(res, coef) } vars <- unname(unlist(res %>% filter(p < 0.05) %>% select(biomarker))) sub <- df[, c("MAFLD", vars)] reg <- glm(MAFLD ~ ., family = binomial(), data = sub) summary(reg) reg1 <- step(reg) coef1 <- data.frame(summary(reg1)$coefficients) coef1 <- coef1[2:nrow(coef1), c(1, 2, 4)] coef1 <- cbind(rownames(coef1), coef1) names(coef1) <- c("biomarker", "beta", "se", "p") row.names(coef1) <- NULL # density plot vars <- names(reg1$coefficients)[-1] vars <- gsub("`", "", vars) df_p <- df[, c(vars, "MAFLD")] df_p <- df_p %>% gather(variable, value, -MAFLD) %>% mutate(MAFLD = as.character(MAFLD)) p <- ggplot(df_p, aes(x = value, group = MAFLD, fill = MAFLD)) + geom_density(alpha = 0.5, , adjust = 0.3) + facet_wrap(~variable, scales = "free") + scale_y_continuous(labels = function(x) sprintf("%.1f", x)) + xlab("") + ylab("") + theme( legend.position = c(0.9, 0.1), legend.box = "inside" ) png("plot/density.png", height = 1000, width = 2000, res = 160) print(p) dev.off()