# bdd_teds/data_wrangling.R at main · LabAsim/bdd_teds · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
library(haven)
library(tidyverse)
options(lifecycle_verbosity = "warning")

# The data is saved in an encrypted file.
# Load it just once per session: the SPSS file is read only when no object
# called `df_raw` exists yet.
# `!exists()` is used rather than `== F`, because `F` is an ordinary variable
# that can be reassigned, which would silently invert the check.
if (!exists("df_raw")) {
  df_raw <- read_sav("G:\\693 Georgina Krebs BDD and victimisation Jan2025.sav")
}

#############
# IMPORTANT #
#############

# See: https://datadictionary.teds.ac.uk/studies/data_processing/data_processing.htm#Double_entering_parent
# The variables having names ending
# in '1' contain data for this twin (whether elder or younger).
# The variables having names ending in '2' contain data for this twin's co-twin.


# Create a new df to store the variables that we will need.
# `to_remove` is only a placeholder that gives `df` as many rows as `df_raw`;
# it is dropped again further down in this script.
df <- data.frame(to_remove = rep(NA, times = nrow(df_raw)))

#################
# Miscellaneous #
#################

df <- local({
  # The columns are collected in a named list (new name = value) and then
  # appended one by one, so `df` receives them in exactly the listed order.
  # Coding of sex1:
  # https://datadictionary.teds.ac.uk/studies/variable_lists/background_variables.htm
  misc_columns <- list(
    twin_id = df_raw[["randomtwinid"]],
    fam_id = df_raw[["randomfamid"]],
    twin_order = df_raw[["twin"]],
    random_twin_from_pair = df_raw[["random"]],
    school_cohort = df_raw[["cohort"]],
    school_cohort_fct = haven::as_factor(df_raw[["cohort"]]),
    sex_1 = df_raw[["sex1"]],
    sex_1_fct = factor(
      df_raw[["sex1"]],
      levels = c(0, 1),
      labels = c("Female", "Male")
    ),
    sex_2 = df_raw[["sex2"]],
    zygosity_binary = df_raw[["zygos"]],
    zygosity_binary_fct = factor(
      df_raw[["zygos"]],
      levels = c(1, 2),
      labels = c("MZ", "DZ")
    ),
    zygosity_ternary = df_raw[["x3zygos"]],
    zygosity_quinary = df_raw[["sexzyg"]],
    ses_1st_contact = df_raw[["ases"]],
    ethnic = df_raw[["aethnic"]],
    ethnic_fct = factor(
      df_raw[["aethnic"]],
      levels = c(0, 1),
      labels = c("other", "white")
    ),
    exclude1 = df_raw[["exclude1"]],
    exclude2 = df_raw[["exclude2"]]
  )

  out <- df
  for (column_name in names(misc_columns)) {
    out[[column_name]] <- misc_columns[[column_name]]
  }
  out
})

##########
# AGE 12 #
##########

# MPVS at the age of 12
# See https://datadictionary.teds.ac.uk/studies/measures/12yr_measures.htm
df <- local({
  out <- df

  # Copies raw columns into `target` under new names
  # (`name_map`: new name = raw name), keeping the order of the map.
  add_raw_columns <- function(target, name_map) {
    for (new_name in names(name_map)) {
      target[[new_name]] <- df_raw[[name_map[[new_name]]]]
    }
    target
  }

  # Subscale scores: lcvicph1/2, lcvicpr1/2, lcvicso1/2, lcvicve1/2
  out <- add_raw_columns(out, c(
    mpvs_physical_12_1 = "lcvicph1",
    mpvs_physical_12_2 = "lcvicph2",
    mpvs_verbal_12_1 = "lcvicve1",
    mpvs_verbal_12_2 = "lcvicve2",
    mpvs_social_12_1 = "lcvicso1",
    mpvs_social_12_2 = "lcvicso2",
    mpvs_property_12_1 = "lcvicpr1",
    mpvs_property_12_2 = "lcvicpr2"
  ))

  # The 16 single items of the twin: lcvic011, lcvic021, ..., lcvic161
  for (item_no in seq_len(16)) {
    out[[sprintf("mpvs_item_%d_12_1", item_no)]] <-
      df_raw[[sprintf("lcvic%02d1", item_no)]]
  }

  # Total score = sum of the four subscales of the twin.
  # With na.rm = FALSE the total is NA as soon as one subscale is missing.
  subscales_of_twin <- c(
    "mpvs_physical_12_1",
    "mpvs_verbal_12_1",
    "mpvs_social_12_1",
    "mpvs_property_12_1"
  )
  out[["mpvs_total_12_1"]] <- rowSums(
    x = out[, subscales_of_twin],
    na.rm = FALSE
  )

  # Age
  out <- add_raw_columns(out, c(
    age_parent_12 = "lpqage",
    age_teach_12_1 = "ltqage1",
    age_teach_12_2 = "ltqage2",
    age_child_12_1 = "lcqage1",
    age_child_12_2 = "lcqage2"
  ))

  out
})


##########
# AGE 14 #
##########

# MPVS at the age of 14: three sets of variables (parent, child, teacher)
# See: https://datadictionary.teds.ac.uk/studies/derived_variables/14yr_derived_variables.htm#vic
df <- local({
  out <- df

  # Prefix of the raw variable names for each set: npvic*, ncvic*, ntvic*
  rater_prefix <- c(parent = "np", child = "nc", teacher = "nt")
  # Code of each subscale inside the raw variable names
  subscale_code <- c(
    physical = "ph",
    verbal = "ve",
    social = "so",
    property = "pr"
  )

  for (rater in names(rater_prefix)) {
    prefix <- rater_prefix[[rater]]

    # Subscales of the twin (1) and of the co-twin (2),
    # e.g. mpvs_physical_parent_14_1 <- npvicph1
    for (subscale in names(subscale_code)) {
      for (twin in 1:2) {
        out[[sprintf("mpvs_%s_%s_14_%d", subscale, rater, twin)]] <-
          df_raw[[sprintf("%svic%s%d", prefix, subscale_code[[subscale]], twin)]]
      }
    }

    # The 16 single items of the twin,
    # e.g. mpvs_item_1_parent_14_1 <- npvic011
    for (item_no in seq_len(16)) {
      out[[sprintf("mpvs_item_%d_%s_14_1", item_no, rater)]] <-
        df_raw[[sprintf("%svic%02d1", prefix, item_no)]]
    }

    # Total score of the twin = physical + verbal + social + property
    # (rowSums() keeps its default na.rm = FALSE: NA if any subscale is NA)
    subscales_of_twin <- sprintf(
      "mpvs_%s_%s_14_1", names(subscale_code), rater
    )
    out[[sprintf("mpvs_total_%s_14_1", rater)]] <- rowSums(
      x = out[, subscales_of_twin]
    )
  }

  # Age
  age_map <- c(
    age_parent_14 = "npqage",
    age_teach_14_1 = "ntqage1",
    age_teach_14_2 = "ntqage2",
    age_child_14_1 = "ncqage1",
    age_child_14_2 = "ncqage2"
  )
  for (new_name in names(age_map)) {
    out[[new_name]] <- df_raw[[age_map[[new_name]]]]
  }

  out
})

##########
# AGE 16 #
##########

df <- local({
  out <- df

  # Copies raw columns into `target` under new names
  # (`name_map`: new name = raw name), keeping the order of the map.
  add_raw_columns <- function(target, name_map) {
    for (new_name in names(name_map)) {
      target[[new_name]] <- df_raw[[name_map[[new_name]]]]
    }
    target
  }

  # MPVS at 16 years
  # See: https://datadictionary.teds.ac.uk/studies/derived_variables/16yr_derived_variables.htm#pcpevit
  out <- add_raw_columns(out, c(
    mpvs_total_16_1 = "pcpevit1",
    mpvs_total_16_2 = "pcpevit2"
  ))

  # The 6 single items of the twin: pcqbpevi11, pcqbpevi21, ..., pcqbpevi61
  for (item_no in seq_len(6)) {
    out[[sprintf("mpvs_item_%d_16_1", item_no)]] <-
      df_raw[[sprintf("pcqbpevi%d1", item_no)]]
  }

  out <- add_raw_columns(out, c(
    # Eating Disorders Diagnostic Scale
    eat_dis_scale_16_1 = "pcbheddsm1",
    eat_dis_scale_16_2 = "pcbheddsm2",
    # Age
    age_child_web_16_1 = "pcwebage1",
    age_child_web_16_2 = "pcwebage2",
    age_child_booklet_16_1 = "pcbhage1",
    age_child_booklet_16_2 = "pcbhage2",
    age_parent_16 = "ppbhage",
    age_leap_study_parent_16 = "ppl2age"
  ))

  out
})

##########
# AGE 21 #
##########

df <- local({
  out <- df

  # Copies raw columns into `target` under new names
  # (`name_map`: new name = raw name), keeping the order of the map.
  add_raw_columns <- function(target, name_map) {
    for (new_name in names(name_map)) {
      target[[new_name]] <- df_raw[[name_map[[new_name]]]]
    }
    target
  }

  # MPVS at 21 years
  # See: https://datadictionary.teds.ac.uk/studies/derived_variables/21yr_derived_variables.htm#u2cvict
  out <- add_raw_columns(out, c(
    cyber_bullying_phase1_1 = "u1cobult1",
    cyber_bullying_phase1_2 = "u1cobult2",
    mpvs_total_phase_2_21_1 = "u2cvictt1",
    mpvs_total_phase_2_21_2 = "u2cvictt2"
  ))

  # Phase 2, the 16 single items of the twin: u2cvict011, ..., u2cvict161
  for (item_no in seq_len(16)) {
    out[[sprintf("mpvs_item_%d_phase_2_21_1", item_no)]] <-
      df_raw[[sprintf("u2cvict%02d1", item_no)]]
  }

  # Covid waves 1 to 4, 12 single items of the twin per wave:
  # ucv1vict011, ..., ucv1vict121, ucv2vict011, ..., ucv4vict121
  for (wave in seq_len(4)) {
    for (item_no in seq_len(12)) {
      out[[sprintf("mpvs_item_%d_cov%d_21_1", item_no, wave)]] <-
        df_raw[[sprintf("ucv%dvict%02d1", wave, item_no)]]
    }
  }

  # Covid waves 1 to 4, total scores of the twin (1) and the co-twin (2):
  # ucv1victt1, ucv1victt2, ..., ucv4victt2
  for (wave in seq_len(4)) {
    for (twin in 1:2) {
      out[[sprintf("mpvs_total_cov%d_21_%d", wave, twin)]] <-
        df_raw[[sprintf("ucv%dvictt%d", wave, twin)]]
    }
  }

  out <- add_raw_columns(out, c(
    # Anorexia nervosa diagnosis
    anorexia_diag_21_phase1_1 = "u1ceatd11",
    anorexia_diag_21_phase1_2 = "u1ceatd12",
    # Bulimia
    bulimia_diag_21_phase1_1 = "u1ceatd21",
    bulimia_diag_21_phase1_2 = "u1ceatd22",
    # Binge eating disorder
    binge_eat_diag_21_phase1_1 = "u1ceatd31",
    binge_eat_diag_21_phase1_2 = "u1ceatd32",
    # Eating disorders symptoms scale
    # https://datadictionary.teds.ac.uk/studies/derived_variables/21yr_derived_variables.htm#eats
    # Binge-eating total score (TEDS21 phase 1 twin qnr), 0-15
    bing_eat_scale_phase1_1 = "u1ceatsbint1",
    bing_eat_scale_phase1_2 = "u1ceatsbint2",
    # Body preoccupation total score (0-40)
    body_preoccup_phase1_1 = "u1ceatsbodt1",
    body_preoccup_phase1_2 = "u1ceatsbodt2",
    # Age
    age_phase1_parent_21 = "u1page",
    age_phase1_child_21_1 = "u1cage1",
    age_phase1_child_21_2 = "u1cage2",
    age_phase2_child_21_1 = "u2cage1",
    age_phase2_child_21_2 = "u2cage2",
    age_cov1_child_21_1 = "ucv1age1",
    age_cov1_child_21_2 = "ucv1age2",
    age_cov2_child_21_1 = "ucv2age1",
    age_cov2_child_21_2 = "ucv2age2",
    age_cov3_child_21_1 = "ucv3age1",
    age_cov3_child_21_2 = "ucv3age2",
    age_cov4_child_21_1 = "ucv4age1",
    age_cov4_child_21_2 = "ucv4age2"
  ))

  out
})


##########
# AGE 26 #
##########

df <- local({
  out <- df

  # Copies raw columns into `target` under new names
  # (`name_map`: new name = raw name), keeping the order of the map.
  add_raw_columns <- function(target, name_map) {
    for (new_name in names(name_map)) {
      target[[new_name]] <- df_raw[[name_map[[new_name]]]]
    }
    target
  }

  # DCQ total score at age 26
  # Total scale, from all 7 items of the DCQ-BDD measure in the twin MHQ.
  # Each item has values 0/1/2/3, hence the scale values have range 0 to 21.
  # See: (https://datadictionary.teds.ac.uk/studies/derived_variables/26yr_derived_variables.htm#zmhbddt)
  out <- add_raw_columns(out, c(
    dcq_total_26_1 = "zmhbddt1",
    dcq_total_26_2 = "zmhbddt2"
  ))

  # The 7 single DCQ items of the twin: zmhbdd11, zmhbdd21, ..., zmhbdd71
  for (item_no in seq_len(7)) {
    out[[sprintf("dcq_item_%d_26_1", item_no)]] <-
      df_raw[[sprintf("zmhbdd%d1", item_no)]]
  }

  out <- add_raw_columns(out, c(
    bdd_diagnosis_26_1 = "zmhmhddx1m1",
    bdd_diagnosis_26_2 = "zmhmhddx1m2",
    # Age
    age_26_1 = "zmhage1",
    age_26_2 = "zmhage2"
  ))

  out
})

# Eating disorders
# Derived
# Each raw variable is kept twice: as it comes from `df_raw` and as a labelled
# factor (`_fct`).
df$anorexia_derived_26_1 <- df_raw$zmheatdandiag1
df$anorexia_derived_fct_26_1 <- factor(
  df_raw$zmheatdandiag1,
  levels = c(0, 1, 2, 3),
  labels = c(
    "No diagnosis", "Without subtype", "Restricting", "Purging/binge eating"
  )
)
df$binge_derived_26_1 <- df_raw$zmheatdbediag1
df$binge_derived_fct_26_1 <- factor(
  df_raw$zmheatdbediag1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
df$bulimia_derived_26_1 <- df_raw$zmheatdbndiag1
df$bulimia_derived_fct_26_1 <- factor(
  df_raw$zmheatdbndiag1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Any derived eating disorder ("Yes"/"No"), combined over the three variables.
# `derived_items` is also used by the summaries further down.
derived_items <- c("anorexia_derived_26_1", "binge_derived_26_1", "bulimia_derived_26_1")
df <- df |>
  mutate(
    eating_derived_26_1 = case_when(
      # See here: https://stackoverflow.com/a/72597660
      # https://stackoverflow.com/questions/79590966/difference-between-if-anyany-ofvars-and-if-anyall-ofvars
      # The branches are tried in order, the first match wins.
      # All three missing -> missing
      if_all(all_of(derived_items), is.na) ~ NA_character_,
      # All three equal to 0 -> "No"
      if_all(all_of(derived_items), ~ .x == 0) ~ "No",
      bulimia_derived_26_1 == 1 ~ "Yes",
      binge_derived_26_1 == 1 ~ "Yes",
      anorexia_derived_26_1 == 1 | anorexia_derived_26_1 == 2 | anorexia_derived_26_1 == 3 ~ "Yes",
      # NOTE(review): rows with some items missing and no positive item
      # end up here, i.e. they are coded "No" - confirm this is intended.
      .default = "No"
    )
  )
# The levels are spelled out (as for `eating_diagnosis_fct_26_1`) instead of
# being inferred by `as.factor()`, so both levels exist, in this order, even
# when one of the two answers does not occur in the data.
df$eating_derived_fct_26_1 <- factor(
  df$eating_derived_26_1,
  levels = c("No", "Yes")
)
# View(df[, c("eating_derived_26_1", derived_items)])

# Ever diagnosed by a professional
# See the codes here:
# https://datadictionary.teds.ac.uk/pdfs/26yr/26yr_mhq_coding.pdf
# Each raw variable is kept twice: as it comes from `df_raw` and as a
# "No"/"Yes" factor (`_fct`, raw codes 0/1).
df$anorexia_lifetime_26_1 <- df_raw$zmhmhddx2a1
df$anorexia_lifetime_fct_26_1 <- factor(
  df_raw$zmhmhddx2a1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
df$bulimia_lifetime_26_1 <- df_raw$zmhmhddx2b1
df$bulimia_lifetime_fct_26_1 <- factor(
  df_raw$zmhmhddx2b1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
df$over_eating_lifetime_26_1 <- df_raw$zmhmhddx2c1
df$over_eating_lifetime_fct_26_1 <- factor(
  df_raw$zmhmhddx2c1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
df$binge_lifetime_26_1 <- df_raw$zmhmhddx2d1
df$binge_lifetime_fct_26_1 <- factor(
  df_raw$zmhmhddx2d1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)
df$eating_other_lifetime_26_1 <- df_raw$zmhmhddx2e1
df$eating_other_lifetime_fct_26_1 <- factor(
  df_raw$zmhmhddx2e1,
  levels = c(0, 1),
  labels = c("No", "Yes")
)

# Any lifetime eating-disorder diagnosis ("Yes"/"No") over the five variables.
# `lifetime_items` is also used by the summaries further down.
lifetime_items <- c(
  "anorexia_lifetime_26_1",
  "binge_lifetime_26_1",
  "over_eating_lifetime_26_1",
  "bulimia_lifetime_26_1",
  "eating_other_lifetime_26_1"
)
df <- df |>
  mutate(
    eating_diagnosis_26_1 = case_when(
      # See here: https://stackoverflow.com/a/72597660
      # https://stackoverflow.com/questions/79590966/difference-between-if-anyany-ofvars-and-if-anyall-ofvars
      # The branches are tried in order, the first match wins.
      # At least one item equal to 1 -> "Yes"
      if_any(all_of(lifetime_items), ~ .x == 1) ~ "Yes",
      # All five missing -> missing (typed NA, as in `eating_derived_26_1`)
      if_all(all_of(lifetime_items), is.na) ~ NA_character_,
      # Everything else, including partly missing rows without a 1 -> "No"
      .default = "No"
    )
  )
df$eating_diagnosis_fct_26_1 <- factor(
  df$eating_diagnosis_26_1,
  levels = c("No", "Yes"),
  labels = c("No", "Yes")
)

# Sanity checks. They are shown only when the script is run interactively or
# with echo; top-level values are not auto-printed under a plain source().
# Character version against factor version of the lifetime diagnosis
table(
  df$eating_diagnosis_26_1,
  df$eating_diagnosis_fct_26_1,
  deparse.level = 2,
  useNA = "always"
)
# Derived eating disorder against lifetime diagnosis
table(df$eating_derived_fct_26_1, df$eating_diagnosis_fct_26_1,
  deparse.level = 2, useNA = "always"
)
# Frequencies of all eating-disorder variables: numeric and character columns
# are turned into factors first, so that summary() counts their values.
# `across(where())` replaces the superseded `mutate_if()`; the predicate is
# evaluated once per column, hence the scalar `||`.
summary(
  mutate(
    df[, c("eating_diagnosis_fct_26_1", lifetime_items, "eating_derived_fct_26_1", derived_items)],
    across(where(\(x) is.numeric(x) || is.character(x)), as.factor)
  )
)


# Dichotomise the DCQ total score at two cut-offs (>= 11 and >= 17).
# A missing total stays NA: the comparison yields NA and ifelse() passes it on,
# so no separate is.na() branch is needed.
# The levels are fixed to c("No", "Yes") instead of being inferred from the
# data; the integer codes in the `_numeric` columns are therefore always
# 1 = "No" and 2 = "Yes", even if one of the two groups happens to be empty.
df$dcq_total_26_1_cutoff11 <- factor(
  ifelse(
    test = df$dcq_total_26_1 >= 11,
    yes = "Yes",
    no = "No"
  ),
  levels = c("No", "Yes")
)
# as.numeric() on a factor returns the level codes (1/2), not 0/1
df$dcq_total_26_1_cutoff11_numeric <- as.numeric(df$dcq_total_26_1_cutoff11)


df$dcq_total_26_1_cutoff17 <- factor(
  ifelse(
    test = df$dcq_total_26_1 >= 17,
    yes = "Yes",
    no = "No"
  ),
  levels = c("No", "Yes")
)
# as.numeric() on a factor returns the level codes (1/2), not 0/1
df$dcq_total_26_1_cutoff17_numeric <- as.numeric(df$dcq_total_26_1_cutoff17)
###############
# Save raw df #
###############
# Copy of all renamed variables, taken before any twin is dropped;
# only the `to_remove` placeholder column is left out.
df_raw_named <- dplyr::select(df, -all_of("to_remove"))

#######################
# Drop excluded twins #
#######################
# See https://datadictionary.teds.ac.uk/exclusions.htm
df$aperinat <- df_raw$aperinat
df$sexzyg <- df_raw$sexzyg
df$acontact <- df_raw$acontact

# Keep a twin only when all five conditions hold.
# The mask goes through which(): with plain logical indexing (`df[cond, ]`)
# an NA in one of the flags would not drop the row but replace it with a row
# made up entirely of NA. which() keeps only the rows where the mask is TRUE.
df <- df[
  which(
    df$exclude1 == 0 &
      df$exclude2 == 0 &
      df$acontact == 1 &
      df$sexzyg != 7 &
      df$aperinat == 0
  ),
]

N_WITH_EXCLUDED_TWINS <- NROW(df_raw)
# Every row left in `df` satisfies exclude1 == 0 and exclude2 == 0, so the
# number of dropped twins is simply the difference in row counts.
N_EXCLUDED_TWINS <- NROW(df_raw) - NROW(df)
# The excluded twins
# (the string is shown only when run interactively or with echo; a plain
# source() does not auto-print top-level values)
sprintf(
  "Excluded %d twins", N_EXCLUDED_TWINS
)
# NROW(df[df$sexzyg == 7 | df$acontact == 0 | df$aperinat == 1, ] )
# We don't have the medexcluded,
# thus there is a difference between the above and below line
# NROW(df) - NROW(df[df$exclude2 == 0 & df$exclude1==0, ])
# NROW(df[df$exclude2 == 1 | df$exclude1==1, ])


# Frequencies of the eating-disorder variables after the exclusions: numeric
# and character columns are turned into factors so that summary() counts
# their values. `across(where())` replaces the superseded `mutate_if()`.
summary(
  mutate(
    df[, c("eating_diagnosis_fct_26_1", lifetime_items, "eating_derived_fct_26_1", derived_items)],
    across(where(\(x) is.numeric(x) || is.character(x)), as.factor)
  )
)
# df <- df[df$exclude2 == 0, ]
# summary(
#   mutate_if(
#     df[, c("eating_diagnosis_fct_26_1", lifetime_items, "eating_derived_fct_26_1", derived_items)], function(x) {
#       return(is.numeric(x) | is.character(x))
#     }, as.factor
#   )
# )
############################################################################
# Last but not least, drop only the rows that contain NA in MPVS columns!
# We can impute the remaining NA!
# See: https://stackoverflow.com/a/70325350
# https://www.geeksforgeeks.org/how-to-check-if-characters-are-present-in-a-string-in-r/
############################################################################

# Helper functions; the path is relative to the current working directory.
# The functions called below that are not defined in this file
# (e.g. remove_twins_without_var_decorated) presumably come from here.
source("helper.R")

# Snapshot after the exclusions, with and without the co-twin variables
# (the co-twin variables are the columns whose name ends in "_2")
df_raw_named_without_excluded <- df
df_raw_named_without_excluded_1 <- select(
  df_raw_named_without_excluded,
  !matches("_2$")
)

# Frequencies of the eating-disorder variables in the snapshot: numeric and
# character columns are turned into factors so that summary() counts values
summary(
  mutate(
    df_raw_named_without_excluded[, c("eating_diagnosis_fct_26_1", lifetime_items, "eating_derived_fct_26_1", derived_items)],
    across(where(function(x) is.numeric(x) | is.character(x)), as.factor)
  )
)
# Drop rows that contain ONLY NA's in mpvs (items  + totals + subscales)
# df <- df %>%
#   #filter(!if_all(colnames(df), is.na))
#   filter(
#     !if_all(
#       # Get the column names containing "mpvs"
#       colnames(df)[grepl(pattern="mpvs", x=colnames(df))],
#       is.na
#     )
#   )
#
# Drop rows that contain ONLY NA's in MPVS total scores
# df <- df %>%
#   filter(
#     !if_all(
#       colnames(df)[grepl(pattern="mpvs_total", x=colnames(df))],
#       is.na
#     )
#   )
# df <- remove_twins_without_var_decorated(
#   df=df,
#   group_var = "fam_id",
#   sex_var = "sex_1",
#   pattern = "dcq_item",
#   keep_empty_cotwin = T,
#   NA_threshold = 7
# )


# Step 1: twins without a DCQ total score at age 26.
# NOTE(review): remove_twins_without_var_decorated() is not defined in this
# file (presumably in helper.R). Judging from the counts computed below it is
# expected to return the data with the twins lacking the matched variable(s)
# removed; the exact meaning of its arguments should be confirmed there.
df <- remove_twins_without_var_decorated(
  df = df_raw_named_without_excluded_1,
  group_var = "fam_id",
  sex_var = "sex_1",
  pattern = "dcq_total_26_1$",
  keep_empty_cotwin = TRUE,
  NA_threshold = 1
)

# Rows kept / rows lost in step 1
N_WITH_DCQ_TOTAL <- NROW(df)
N_WITHOUT_DCQ_TOTAL <- NROW(df_raw_named_without_excluded_1) - N_WITH_DCQ_TOTAL
sprintf(
  "%d had no DCQ total", N_WITHOUT_DCQ_TOTAL
)


# Step 2: twins without MPVS total scores.
# NA_threshold = 4 corresponds to the four "mpvs_total" columns that remain
# once the cov, teacher and parent ones are left out
# (there are six without the covid variables and ten in all).
df <- remove_twins_without_var_decorated(
  df = df,
  group_var = "fam_id",
  sex_var = "sex_1",
  pattern = "mpvs_total",
  antipattern = list("cov", "teacher", "parent"),
  keep_empty_cotwin = TRUE,
  NA_threshold = 4
)

# Rows lost in step 2
N_WITHOUT_MPVS_TOTAL <- N_WITH_DCQ_TOTAL - NROW(df)
sprintf(
  "%d had no MPVS total", N_WITHOUT_MPVS_TOTAL
)


# The placeholder column is no longer needed
df <- dplyr::select(df, -all_of("to_remove"))

# Drop the cotwin variables (columns whose name ends in "_2")
df_1 <- select(df, !matches("_2$"))

# From here on only `df_1` is used
rm(df)


# Fill every column whose name contains "age" from the co-twin.
# NOTE(review): fill_multiple_vars_twin_from_cotwin() is not defined in this
# file (presumably in helper.R); its behaviour is assumed from its name.
df_1 <- fill_multiple_vars_twin_from_cotwin(
  df_1,
  vars = purrr::discard(
    grep(pattern = "age", x = colnames(df_1), value = TRUE),
    is.na
  )
)

# df_1 <- fill_var(
#   df=df_1,
#   primary = "age_parent_12",
#   secondary = "age_child_12_1",
#   tertiary = "age_teach_12_1",
#   new_column = "age_12_1"
# )
#
# df_1 <- fill_var(
#   df=df_1,
#   primary = "age_parent_14",
#   secondary = "age_child_14_1",
#   tertiary = "age_teach_14_1",
#   new_column = "age_14_1"
# )
#
# df_1 <-fill_var(
#   df=df_1,
#   primary = "mpvs_total_child_14_1",
#   secondary = "mpvs_total_parent_14_1",
#   tertiary = "mpvs_total_teacher_14_1",
#   new_column = "mpvs_total_14_1"
# )

# At age 16, MPVS questionnaire  was answered only by the twins
# (cohort 1 & 2) during the web study.
# From TEDS' website: https://datadictionary.teds.ac.uk/studies/16yr.htm
# The initial booklet study was administered in two waves:
# wave 1 (cohort 1), called the Behaviour study,
# started immediately after the end of cohort 1's web study;
# wave 2 (cohorts 2, 3 and 4), called the LEAP study,
# started after the end of cohort 2's web study.
# Twin ages ranged from roughly 15 (cohort 4) up to 17.5 (cohorts 1 and 2)
# when booklets were returned.
# Thus, first, we need age_web_16_1.
# If NA exists, we could pull age from parent and then, from child.

# df_1 <- fill_var(
#   df=df_1,
#   primary = "age_web_16_1",
#   secondary = "age_parent_16",
#   tertiary = "age_child_16_1",
#   new_column = "age_16_1"
# )


# At age 21, the MPVS questionnaire was answered in twin phase 1
# and in Covid phases 1, 2, 3 & 4.

# Two passes of fill_age_covid_21(): one with its default order and one with
# the order given below.
# NOTE(review): fill_age_covid_21() is not defined in this file (presumably in
# helper.R). The value "desceding" looks like a misspelling of "descending";
# it is left untouched because the helper may compare against this exact
# string - check the helper before changing it.
df_1 <- fill_age_covid_21(
  df = fill_age_covid_21(df = df_1),
  order = "desceding"
)


# Fill age from co-twin: first the "age_cov" columns ...
df_1 <- fill_multiple_vars_twin_from_cotwin(
  df_1,
  vars = purrr::discard(
    grep(pattern = "age_cov", x = colnames(df_1), value = TRUE),
    is.na
  )
)


# ... then the "age_phase" columns
df_1 <- fill_multiple_vars_twin_from_cotwin(
  df_1,
  vars = purrr::discard(
    grep(pattern = "age_phase", x = colnames(df_1), value = TRUE),
    is.na
  )
)

# df_1 <- scale_mpvs(df=df_1)

# Create a variable representing mean MPVS across the waves
# at age 21

# df_1 <- df_1 %>%
#   mutate(
#     mpvs_total_21_1 = rowMeans(
#       select(
#         df_1,
#         mpvs_total_phase_2_21_1,
#         mpvs_total_cov1_21_1,
#         mpvs_total_cov2_21_1,
#         mpvs_total_cov3_21_1,
#         mpvs_total_cov4_21_1
#       ),
#       na.rm = T
#     )
#   )

# df_1 <- df_1 %>%
#   mutate(
#     mpvs_total_21_scaled_32 = rowMeans(
#       select(
#         df_1,
#         mpvs_total_21_phase_2_1_scaled_32,
#         mpvs_total_21_cov1_1_scaled_32,
#         mpvs_total_21_cov2_1_scaled_32,
#         mpvs_total_21_cov3_1_scaled_32,
#         mpvs_total_21_cov4_1_scaled_32
#       ),
#       na.rm = T
#     )
#   )

# df_1 <- df_1 %>%
#   mutate(
#     age_21_1 = rowMeans(
#       select(
#         .,
#         age_cov1_child_21_1,
#         age_cov2_child_21_1,
#         age_cov3_child_21_1,
#         age_cov4_child_21_1,
#         age_phase2_child_21_1
#       ),
#       na.rm = T
#     )
#   )


########################################
# Save the image to the encrypted disk #
# to load it faster next time          #
########################################
# sys.nframe() is 0 only when this line is evaluated at top level, i.e. not
# from inside a function call. source() is a function call, so the workspace
# image is not written when another script sources this file.
# save.image() stores every object of the global environment in the file.
if (sys.nframe() == 0) {
  save.image(file = "G:\\data_wrangling.RData")
}