Loading data sets and merging and cleaning

## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode

calculating bmi

combined_data_clean$DM21_01 <- as.numeric(as.character(combined_data_clean$DM21_01))
combined_data_clean$DM21_02 <- as.numeric(as.character(combined_data_clean$DM21_02))
combined_data_clean$DM21_03 <- as.numeric(as.character(combined_data_clean$DM21_03))


# Calculate BMI before (based on DM21_01 - weight before and DM21_03 - height)
combined_data_clean$BMI_before <- combined_data_clean$DM21_01 / (combined_data_clean$DM21_03 / 100)^2

# Calculate BMI after (based on DM21_02 - weight after and DM21_03 - height)
combined_data_clean$BMI_after <- combined_data_clean$DM21_02 / (combined_data_clean$DM21_03 / 100)^2


combined_data_clean <- combined_data_clean[!is.na(combined_data_clean$BMI_after), ]


combined_data_clean <- combined_data_clean[combined_data_clean$DM21_03 > 70, ]

calculating PROM scores

IMET

# Variables of interest
variables_to_convert <- paste0("ME01_0", 1:8)

# Convert to numeric, replace -9 with NA, and recode values from 1–11 to 0–10
combined_data_clean[variables_to_convert] <- combined_data_clean[variables_to_convert] |>
  lapply(function(column) {
    numeric_column <- as.numeric(column)  # Convert to numeric
    numeric_column[numeric_column == -9] <- NA  # Replace -9 with NA
    numeric_column <- numeric_column - 1  # Recode values: 1–11 to 0–10
    return(numeric_column)
  }) |>
  as.data.frame()  # Convert back to data frame





# Create a new variable by summing the values of the selected columns,
# setting NA for participants with any missing values in the selected variables
combined_data_clean$ME01_sum <- apply(combined_data_clean[variables_to_convert], 1, function(row) {
  if (any(is.na(row))) {
    return(NA)  # Set NA if any variable is missing
  } else {
    return(sum(row))  # Calculate the sum if all variables have values
  }
})



combined_data_clean <- combined_data_clean %>%
  mutate(
    ME02_0_new = ifelse(ME02_01 == -9, NA, as.numeric(ME02_01) - 1)
  )





combined_data_clean$IMET_total <- ifelse(is.na(combined_data_clean$ME02_0_new) | is.na(combined_data_clean$ME01_sum), NA, combined_data_clean$ME02_0_new + combined_data_clean$ME01_sum)

FAS

# Step 1: Dynamically find columns matching "FA01_01" to "FA01_10"
variables_to_convert2 <- grep("^FA01_0[1-9]$|^FA01_10$", names(combined_data_clean), value = TRUE)

# Step 2: Convert to numeric and replace -9 with NA
combined_data_clean[variables_to_convert2] <- as.data.frame(
  lapply(combined_data_clean[variables_to_convert2], function(column) {
    numeric_column <- as.numeric(as.character(column))  # Convert to numeric
    numeric_column[numeric_column == -9] <- NA          # Replace -9 with NA
    return(numeric_column)
  })
)

# Step 3: Verify the changes
 # Check the selected variables
 # Confirm numeric conversion



# Step 3: Recode scores for questions 4 and 10
combined_data_clean$FA01_04 <- dplyr::recode(
  combined_data_clean$FA01_04,
  `1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1,
  .default = combined_data_clean$FA01_04
)
combined_data_clean$FA01_10 <- dplyr::recode(
  combined_data_clean$FA01_10,
  `1` = 5, `2` = 4, `3` = 3, `4` = 2, `5` = 1,
  .default = combined_data_clean$FA01_10
)


# Step 4: Calculate the total FAS score only for participants who answered all questions
combined_data_clean$FAS_total <- apply(combined_data_clean[variables_to_convert2], 1, function(row) {
  if (any(is.na(row))) {
    return(NA)  # Set NA if any question is unanswered
  } else {
    return(sum(row))  # Calculate the sum if all questions are answered
  }
})

EQ5D 3L

combined_data_clean$EQ05 <- as.numeric(as.character(combined_data_clean$EQ05))
combined_data_clean$EQ06 <- as.numeric(as.character(combined_data_clean$EQ06))
combined_data_clean$EQ_08_01 <- as.numeric(as.character(combined_data_clean$EQ_08_01))


# Replace specific values in EQ05 and EQ06 columns
combined_data_clean <- combined_data_clean %>%
  mutate(
    EQ05 = ifelse(EQ05 == 4, 2, EQ05),  # Replace 4 with 2 in EQ05
    EQ06 = ifelse(EQ06 == 4, 2, EQ06)   # Replace 4 with 2 in EQ06
  )





combined_data_clean$EQ5D3Lcode <- paste0(combined_data_clean$EQ02, combined_data_clean$EQ03, combined_data_clean$EQ04, combined_data_clean$EQ05, combined_data_clean$EQ06)

combined_data_clean$EQ5D_3L <- eq5d(
  scores = combined_data_clean$EQ5D3Lcode, 
  version = "3L",        
  country = "Germany",  
  type = "VAS",
  ignore.invalid = TRUE  # Handle invalid cases gracefully
)

NOW grouping participants to PCS and - NON PCS groups (1= PCS , 0= non PSC)

combined_data_clean <- combined_data_clean %>%
  mutate(PCS_binary = case_when(
    # Condition 1: Participants who are in dftc3hc (or dftc3) with SY29 == 2 becomes 0
    ((QUESTNNR == "dftc3hc" & SY29 == 2) | (QUESTNNR == "dftc3" & SY29 == 2)) ~ 1,
    
    # Condition 2: Participants who are in dftc3hc with SY29 == 1
    #         or in dftc3hc with SY18_01 == 2 becomes 1
    ((QUESTNNR == "dftc3hc" & SY29 == 1) | (QUESTNNR == "dftc3hc" & SY18_01 == 2)) ~ 0,
    
    # If none of the conditions are met, return NA
    TRUE ~ NA_integer_
  ))

2x2 PCS vs Obesity and fisher exact test

# Assume you have a data frame called 'data' with:
# - a numeric variable 'BMI'
# - a variable 'PCS' indicating PCS status (e.g., 1 = PCS, 0 = non-PCS)


# Function to perform the analysis for a given BMI cutoff
analyze_bmi_cutoff <- function(combined_data_clean, cutoff) {
  # Create a binary variable for overweight vs. normal
  combined_data_clean <- combined_data_clean %>%
    mutate(
      overweight_binary = ifelse(BMI_after > cutoff, "Overweight", "Normal")
    )
  
  # Create a 2x2 contingency table of PCS vs overweight
  table_2x2 <- table(PCS = combined_data_clean$PCS_binary, Overweight = combined_data_clean$overweight_binary)
  print(paste("Contingency table for BMI cutoff >", cutoff))
  print(table_2x2)
  
  # Use Fisher's exact test to calculate the odds ratio and CI
  fisher_result <- fisher.test(table_2x2)
  print(fisher_result)
  
  return(fisher_result)
}

# Analyze using a BMI cutoff of >25
result_25 <- analyze_bmi_cutoff(combined_data_clean, cutoff = 25)
## [1] "Contingency table for BMI cutoff > 25"
##    Overweight
## PCS Normal Overweight
##   0     55         28
##   1    137        169
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table_2x2
## p-value = 0.0005282
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.420281 4.186944
## sample estimates:
## odds ratio 
##   2.417585
# Analyze using a BMI cutoff of >30
result_30 <- analyze_bmi_cutoff(combined_data_clean, cutoff = 30)
## [1] "Contingency table for BMI cutoff > 30"
##    Overweight
## PCS Normal Overweight
##   0     74          9
##   1    225         81
## 
##  Fisher's Exact Test for Count Data
## 
## data:  table_2x2
## p-value = 0.002024
## alternative hypothesis: true odds ratio is not equal to 1
## 95 percent confidence interval:
##  1.389426 7.025487
## sample estimates:
## odds ratio 
##   2.953125

Log regression of PCS vs BMI categorized

combined_data_clean$BMI_cat <- cut(combined_data_clean$BMI_after,
                             breaks = c(-Inf, 18.5, 24.9, 29.9, 34.9, 39.9, Inf),
                             labels = c('Underweight', 
                                        'Normal weight', 
                                        'Pre-obesity', 
                                        'Obesity class I', 
                                        'Obesity class II',
                                        'Obesity class III'),
                             include.lowest = TRUE,
                             right=TRUE)
tab <- table(combined_data_clean$PCS_binary, combined_data_clean$BMI_cat)
print(tab)
##    
##     Underweight Normal weight Pre-obesity Obesity class I Obesity class II
##   0           4            49          21               7                1
##   1          13           122          90              38               27
##    
##     Obesity class III
##   0                 1
##   1                16
combined_data_clean$BMI_cat <- relevel(combined_data_clean$BMI_cat, ref = "Normal weight")

model <- glm(PCS_binary ~ BMI_cat, data = combined_data_clean, family = binomial(link = "logit"))

summary(model)
## 
## Call:
## glm(formula = PCS_binary ~ BMI_cat, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                0.9122     0.1691   5.393 6.91e-08 ***
## BMI_catUnderweight         0.2665     0.5963   0.447   0.6550    
## BMI_catPre-obesity         0.5431     0.2955   1.838   0.0661 .  
## BMI_catObesity class I     0.7795     0.4447   1.753   0.0796 .  
## BMI_catObesity class II    2.3836     1.0318   2.310   0.0209 *  
## BMI_catObesity class III   1.8604     1.0445   1.781   0.0749 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 403.30  on 388  degrees of freedom
## Residual deviance: 386.23  on 383  degrees of freedom
##   (94 observations deleted due to missingness)
## AIC: 398.23
## 
## Number of Fisher Scoring iterations: 5

as there is very low number of non pcs participants in each obesity category as it can be seen i tried again with grouping them together

combined_data_clean$BMI_cat_grouped <- cut(combined_data_clean$BMI_after,
                                           breaks = c(-Inf, 18.5, 24.9, 29.9, Inf),
                                           labels = c('Underweight',
                                                      'Normal weight',
                                                      'Pre-obesity',
                                                      'Obesity'),
                                           include.lowest = TRUE,
                                           right = TRUE)


tab_grouped <- table(combined_data_clean$PCS_binary, combined_data_clean$BMI_cat_grouped)
print(tab_grouped)
##    
##     Underweight Normal weight Pre-obesity Obesity
##   0           4            49          21       9
##   1          13           122          90      81
combined_data_clean$BMI_cat_grouped <- relevel(combined_data_clean$BMI_cat_grouped, ref = "Normal weight")


model_grouped <- glm(PCS_binary ~ BMI_cat_grouped, data = combined_data_clean, family = binomial(link = "logit"))

summary(model_grouped)
## 
## Call:
## glm(formula = PCS_binary ~ BMI_cat_grouped, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  0.9122     0.1691   5.393 6.91e-08 ***
## BMI_cat_groupedUnderweight   0.2665     0.5963   0.447 0.654965    
## BMI_cat_groupedPre-obesity   0.5431     0.2955   1.838 0.066106 .  
## BMI_cat_groupedObesity       1.2850     0.3899   3.296 0.000982 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 403.30  on 388  degrees of freedom
## Residual deviance: 389.61  on 385  degrees of freedom
##   (94 observations deleted due to missingness)
## AIC: 397.61
## 
## Number of Fisher Scoring iterations: 4

logistic regresstions as BMI binary (cutof 30) as dependent variable and PCS with hrQoL measures as independent variables

1: obese 0 : non obese

combined_data_clean$BMI_binary <- ifelse(combined_data_clean$BMI_after >= 30, 1, 0)


# BMI ~ PCS_group + IMET
model1 <- glm(BMI_binary ~ PCS_binary + IMET_total, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

# BMI ~ PCS_group + EQ-5D-3L index score
model2 <- glm(BMI_binary ~ PCS_binary + EQ5D_3L, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

# BMI ~ PCS_group + EQ-5D VAS
model3 <- glm(BMI_binary ~ PCS_binary + EQ_08_01, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

# BMI ~ PCS_group + FAS Score
model4 <- glm(BMI_binary ~ PCS_binary + FAS_total, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

BMI binary vs PCS binary (alone)

# BMI ~ PCS_group
model10 <- glm(BMI_binary ~ PCS_binary , 
              data = combined_data_clean, 
              family = binomial(link = "logit"))
summary(model10)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.107      0.353  -5.968  2.4e-09 ***
## PCS_binary     1.085      0.376   2.886   0.0039 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 420.83  on 388  degrees of freedom
## Residual deviance: 410.66  on 387  degrees of freedom
##   (94 observations deleted due to missingness)
## AIC: 414.66
## 
## Number of Fisher Scoring iterations: 4

BMI ~ PCS_group + IMET

summary(model1)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary + IMET_total, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.417234   0.388212  -6.227 4.77e-10 ***
## PCS_binary   0.860822   0.434426   1.982   0.0475 *  
## IMET_total   0.012546   0.006166   2.035   0.0419 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.27  on 363  degrees of freedom
## Residual deviance: 376.93  on 361  degrees of freedom
##   (119 observations deleted due to missingness)
## AIC: 382.93
## 
## Number of Fisher Scoring iterations: 4

BMI ~ PCS_group + EQ-5D-3L index score

summary(model2)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary + EQ5D_3L, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  -1.3028     0.5542  -2.351   0.0187 *
## PCS_binary    0.7986     0.4034   1.980   0.0477 *
## EQ5D_3L      -0.9282     0.5016  -1.850   0.0642 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 405.92  on 378  degrees of freedom
## Residual deviance: 392.92  on 376  degrees of freedom
##   (104 observations deleted due to missingness)
## AIC: 398.92
## 
## Number of Fisher Scoring iterations: 4

BMI ~ PCS_group + EQ-5D VAS

summary(model3)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary + EQ_08_01, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)   
## (Intercept) -0.77754    0.59215  -1.313  0.18915   
## PCS_binary   0.47230    0.43662   1.082  0.27937   
## EQ_08_01    -0.01734    0.00644  -2.693  0.00708 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 403.85  on 374  degrees of freedom
## Residual deviance: 386.72  on 372  degrees of freedom
##   (108 observations deleted due to missingness)
## AIC: 392.72
## 
## Number of Fisher Scoring iterations: 4

BMI ~ PCS_group + FAS Score

summary(model4)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary + FAS_total, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.21220    0.53884  -5.961  2.5e-09 ***
## PCS_binary   0.46284    0.42723   1.083  0.27865    
## FAS_total    0.04822    0.01647   2.927  0.00342 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 396.90  on 370  degrees of freedom
## Residual deviance: 378.43  on 368  degrees of freedom
##   (112 observations deleted due to missingness)
## AIC: 384.43
## 
## Number of Fisher Scoring iterations: 4

checking models for colinearity

vif_values <- vif(model1)
print(vif_values)
## PCS_binary IMET_total 
##   1.197139   1.197139
vif_values <- vif(model2)
print(vif_values)
## PCS_binary    EQ5D_3L 
##   1.138709   1.138709
vif_values <- vif(model3)
print(vif_values)
## PCS_binary   EQ_08_01 
##   1.318723   1.318723
vif_values <- vif(model4)
print(vif_values)
## PCS_binary  FAS_total 
##   1.248504   1.248504

so there is no important co-linearity between predictors

logistic regresstions as BMI binary (cutof 30) as dependent variable and interaction of PCS and hrQoL measures as independent variables

# BMI ~ PCS_group * IMET
model5 <- glm(BMI_binary ~ PCS_binary * IMET_total, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

# BMI ~ PCS_group * EQ-5D-3L index score
model6 <- glm(BMI_binary ~ PCS_binary * EQ5D_3L, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

# BMI ~ PCS_group * EQ-5D VAS
model7 <- glm(BMI_binary ~ PCS_binary * EQ_08_01, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

# BMI ~ PCS_group * FAS Score
model8 <- glm(BMI_binary ~ PCS_binary * FAS_total, 
              data = combined_data_clean, 
              family = binomial(link = "logit"))

BMI ~ PCS_group * IMET

summary(model5)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary * IMET_total, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##                        Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           -2.380552   0.475925  -5.002 5.68e-07 ***
## PCS_binary             0.810198   0.580135   1.397    0.163    
## IMET_total             0.010376   0.017852   0.581    0.561    
## PCS_binary:IMET_total  0.002474   0.019034   0.130    0.897    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 393.27  on 363  degrees of freedom
## Residual deviance: 376.91  on 360  degrees of freedom
##   (119 observations deleted due to missingness)
## AIC: 384.91
## 
## Number of Fisher Scoring iterations: 4

BMI ~ PCS_group * EQ-5D-3L index score

summary(model6)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary * EQ5D_3L, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -1.8354     1.5255  -1.203    0.229
## PCS_binary           1.3684     1.5606   0.877    0.381
## EQ5D_3L             -0.3097     1.7028  -0.182    0.856
## PCS_binary:EQ5D_3L  -0.6839     1.7833  -0.384    0.701
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 405.92  on 378  degrees of freedom
## Residual deviance: 392.76  on 375  degrees of freedom
##   (104 observations deleted due to missingness)
## AIC: 400.76
## 
## Number of Fisher Scoring iterations: 4

BMI ~ PCS_group * EQ-5D VAS

summary(model7)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary * EQ_08_01, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)  
## (Intercept)          0.59937    1.19992   0.500   0.6174  
## PCS_binary          -1.04263    1.23918  -0.841   0.4001  
## EQ_08_01            -0.03670    0.01652  -2.222   0.0263 *
## PCS_binary:EQ_08_01  0.02276    0.01787   1.273   0.2029  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 403.85  on 374  degrees of freedom
## Residual deviance: 385.08  on 371  degrees of freedom
##   (108 observations deleted due to missingness)
## AIC: 393.08
## 
## Number of Fisher Scoring iterations: 5

BMI ~ PCS_group * FAS Score

summary(model8)
## 
## Call:
## glm(formula = BMI_binary ~ PCS_binary * FAS_total, family = binomial(link = "logit"), 
##     data = combined_data_clean)
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)    
## (Intercept)          -3.147836   0.935230  -3.366 0.000763 ***
## PCS_binary            0.372310   1.159726   0.321 0.748185    
## FAS_total             0.045577   0.035564   1.282 0.199998    
## PCS_binary:FAS_total  0.003364   0.040151   0.084 0.933227    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 396.90  on 370  degrees of freedom
## Residual deviance: 378.42  on 367  degrees of freedom
##   (112 observations deleted due to missingness)
## AIC: 386.42
## 
## Number of Fisher Scoring iterations: 4