3. Data Transformation in R (tidyverse)

Introduction

Topics

R for Data Science - Data Transformation
dplyr
part of the tidyverse universe

ggplot

*no more than 9 colors

Could do a geom_point, geom_line graph looking at patient_id and time to use group_by, geom_point, geom_line, x/y/color Also use boxplot to do statistics on dataset

Test Data Preparation:

For these examples, begin by loading the required packages: (only tidyverse is required for these particular tutorials)

library("tidylog"); packageVersion("tidylog")
library("tidyverse"); packageVersion("tidyverse")

Then load the test data:

data <- read.table("./data.txt", sep="\t", header=TRUE)

To download the data, click to open the data link (below) and then right-click to download. It is tab-delimited.

data

select

select specific features (columns) of interest from the data set

tax_info <- select(data, Kingdom, Phylum, Class, Order, Family, Genus, Species) %>%  
  distinct() 
head(tax_info,1)

Kingdom	Phylum	Class	Order	Family	Genus	Species
Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species

distinct

Identify all the distinct entries for the Kingdom variable by summarising Kingdom and asking for only those that are distinct entries

summarise(data, Kingdom) %>%
  distinct()

	Kingdom
1	Viruses
2	Kingdom_undefined
3	not classified

filter

filter out irrelevant data and only keep observations (rows) of interest

# Keep only rows whose Kingdom column contains Viruses

viral_data <- filter(data, Kingdom == "Viruses")   
head(viral_data,2)

OTU	Sample	Abundance	participantID	cohort	age	BMI	gender	race	Kingdom	Phylum	Class	Order	Family	Genus	Species	Baltimore	Baltimore.Group
sp1	22	373339.2	22	IBS-D	29	28.0	female	white	Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species	ssDNA	II
sp1	38	221155.4	38	IBS-D	28	20.6	female	white	Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species	ssDNA	II

# Keep only rows whose age column is <= 28

age_data <- filter(data, age <= 28)  
head(age_data,2)

OTU	Sample	Abundance	participantID	cohort	age	BMI	gender	race	Kingdom	Phylum	Class	Order	Family	Genus	Species	Baltimore	Baltimore.Group
sp1	38	221155.4	38	IBS-D	28	20.6	female	white	Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species	ssDNA	II
sp1	63	213158.0	63	IBS-C	21	24.92	female	white	Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species	ssDNA	II

mutate

mutate a data set by adding more features (columns)

# Create a new column to categorize BMI

bmi.cat <- data %>% 
  mutate(bmi_categories = if_else(BMI < 18.5, "underweight", 
                                  ifelse(BMI >=18.5 & BMI <= 24.9, "normal",
                                         ifelse(BMI >=25 & BMI <= 29.9, "overweight", "obese")))) 

head(bmi.cat,2)

OTU	Sample	Abundance	participantID	cohort	age	BMI	gender	race	Kingdom	Phylum	Class	Order	Family	Genus	Species	Baltimore	Baltimore.Group	bmi_categories
sp1	22	373339.2	22	IBS-D	29	28.0	female	white	Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species	ssDNA	II	overweight
sp1	38	221155.4	38	IBS-D	28	20.6	female	white	Viruses	Phixviricota	Malgrandaviricetes	Petitvirales	Microviridae	Microviridae_undefined_genus	Microviridae_undefined_genus_undefined_species	ssDNA	II	normal

#to double check the new bmi_categories column you can select the bmi_categories and BMI columns to appear first in tibble with everything else after.

select(bmi.cat, bmi_categories, BMI, everything())

#What's up with row 4 with BMI of 24.9 labeled obese? Look at:

bmi.cat$BMI

#BMI values go to the hundredths place so we should edit the code above to include the right number of significant digits.

bmi.cat2 <- data %>% 
  mutate(bmi_categories = if_else(BMI < 18.50, "underweight", 
                                  ifelse(BMI >=18.50 & BMI <= 24.99, "normal",
                                         ifelse(BMI >=25.00 & BMI <= 29.99, "overweight", "obese")))) %>%
  select(bmi_categories, BMI, everything())
bmi.cat2

arrange

arrange the observations (rows) in a particular order

# Arrange data by INCREASING order of the age column
default = increasing order

sorted_age.up <- arrange (data, age)
sorted_age.up %>%
  select(participantID, age) %>% 
  distinct() %>% 
  head()

	participantID	age
1	63	21
2	47	26
3	73	27
4	38	28
5	21	28
6	22	29

# Arrange data by DECREASING order of the age column
for decreasing order add - in front of column name

sorted_age.down <- arrange (data, -age)
sorted_age.down %>%
  select(participantID, age) %>% 
  distinct() %>% 
  head()

	participantID	age
1	4	54
2	79	49
3	44	44
4	3	42
5	23	35
6	37	33

#Arrange data by DECREASING order of the age column could also be written using the desc() command.

sorted_age.down2 <- arrange(data, desc(age)) 
sorted_age.down2 %>%
  select(participantID, age) %>% 
  distinct() %>% 
  head()

	participantID	age
1	4	54
2	79	49
3	44	44
4	3	42
5	23	35
6	37	33

group_by

# Grouping doesn't change how the data looks; it changes how it acts with the other dplyr verbs:
# To removing grouping, use ungroup

bmi_cohort <- data %>% 
  group_by(cohort) %>% 
  mutate(median_bmi_cohort= median(BMI))

# If you want to check which (if any) variables are grouped

group_vars(bmi_cohort)

[1] "cohort"

# Best practice is to ungroup when finished (to prevent unexpected results later)

bmi_cohort <- data %>% 
  group_by(cohort) %>% 
  mutate(median_bmi_cohort= median(BMI)) %>%
  ungroup()

group_vars(bmi_cohort)

character(0)

bmi_cohort %>%
  select(cohort, median_bmi_cohort) %>% 
  arrange(cohort) %>% 
  distinct()

	cohort	median_bmi_cohort
1	Healthy	25.7
2	IBS-C	23.3
3	IBS-D	24.3

summarise

summarize the data in terms of aspects such as the mean, median, sum or maximum # Look at the abundance of each phage family, by cohort

phage_abundance <- data %>% 
 filter(Family=="Demerecviridae" | Family == "Herelleviridae" | Family == "Microviridae" | Family == "Myoviridae" | Family == "Podoviridae" | Family == "Siphoviridae") %>%
  group_by(cohort, Family) %>% 
  summarise(sum = sum(Abundance))

phage_abundance %>%
  select(cohort, Family, sum) %>% 
  arrange(cohort)

	cohort	Family	sum
1	Healthy	Demerecviridae	0
2	Healthy	Herelleviridae	0
3	Healthy	Microviridae	78298.
4	Healthy	Myoviridae	19595.
5	Healthy	Podoviridae	126930.
6	Healthy	Siphoviridae	9100.
7	IBS-C	Demerecviridae	0
8	IBS-C	Herelleviridae	3897.
9	IBS-C	Microviridae	480349.
10	IBS-C	Myoviridae	34366.
11	IBS-C	Podoviridae	66623.
12	IBS-C	Siphoviridae	16253.
13	IBS-D	Demerecviridae	2129
14	IBS-D	Herelleviridae	0
15	IBS-D	Microviridae	1005366.
16	IBS-D	Myoviridae	22490.
17	IBS-D	Podoviridae	254507.
18	IBS-D	Siphoviridae	16511.

tally or count

wrapper for summarise that will either call n() or sum(n) depending on whether you're tallying for the first time, or re-tallying
# Look at the number of Species present for each family by cohort

total_abundance_tally <- data %>% 
  filter(Abundance >0)  %>% 
  droplevels() %>% 
  group_by(cohort, Family) %>% 
  tally()

total_abundance_tally %>%
  arrange(Family)

	cohort	Family	n
1	Healthy	Astroviridae	1
2	IBS-C	Bromoviridae	1
3	Healthy	Caliciviridae	2
4	IBS-D	Demerecviridae	2
5	IBS-C	Herelleviridae	1
6	Healthy	Inoviridae	2
7	Healthy	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	534
8	IBS-C	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	537
9	IBS-D	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	563

count = group_by + tally

total_abundance_count <- data %>% 
  filter(Abundance >0)  %>% 
  droplevels() %>% 
  count(cohort, Family)

total_abundance_count %>%
  arrange(Family)

	cohort	Family	n
1	Healthy	Astroviridae	1
2	IBS-C	Bromoviridae	1
3	Healthy	Caliciviridae	2
4	IBS-D	Demerecviridae	2
5	IBS-C	Herelleviridae	1
6	Healthy	Inoviridae	2
7	Healthy	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	534
8	IBS-C	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	537
9	IBS-D	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	563

# Notice tally returns a tibble; while count maintains a data.frame

class(total_abundance_tally)

[1] "grouped_df" "tbl_df" "tbl" "data.frame"

class(total_abundance_count)

[1] "data.frame"

class(data)

[1] "data.frame"

**BUT ** why do we get a count of Species instead of Families?

mystery <- data %>% 
  filter(Abundance >0)  %>% 
  filter(Family == "Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family")  %>% 
  droplevels() %>% 
  count(cohort, Species) %>% 
  arrange(Species)

Now n = number of positive samples for that species
head(mystery)

	cohort	Species	n
1	Healthy	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family_undefined_Genus_dark_matter sp. cat1000	4
2	IBS-C	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family_undefined_Genus_dark_matter sp. cat1000	4
3	IBS-D	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family_undefined_Genus_dark_matter sp. cat1000	2
4	Healthy	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family_undefined_Genus_dark_matter sp. cat1001	4
5	IBS-C	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family_undefined_Genus_dark_matter sp. cat1001	4
6	IBS-D	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family_undefined_Genus_dark_matter sp. cat1001	4

max(mystery$n)
4
min(mystery$n)
1

# Look at the number of samples for each Family by cohort

total_abundance_tally_fam <- data %>% 
  filter(Abundance >0)  %>% 
  droplevels() %>% 
  select(c(cohort, Family, Sample)) %>% # keep only variables to evaluate
  distinct() %>% # no duplicate entries
  group_by(cohort, Family) %>% 
  tally()

total_abundance_tally_fam %>%
  arrange(Family)

	cohort	Family	n
1	Healthy	Astroviridae	1
2	IBS-C	Bromoviridae	1
3	Healthy	Caliciviridae	2
4	IBS-D	Demerecviridae	2
5	IBS-C	Herelleviridae	1
6	Healthy	Inoviridae	2
7	Healthy	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	4
8	IBS-C	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	4
9	IBS-D	Kingdom_undefined_Phylum_undefined_Class_undefined_Order_undefined_Family	4

Now n = number of positive samples for that Family

rename

#rename column variable particpantID as participant_id

data_2 <- rename(data, participant_id = participantID) %>%
  select(participant_id, OTU:age)
head(data_2, 5)

participant_id	OTU	Sample	Abundance	cohort	age
1	22	sp1	22	373339.	IBS-D
2	38	sp1	38	221155.	IBS-D
3	44	sp1	44	214744.	IBS-C
4	63	sp1	63	213158.	IBS-C
5	23	sp1	23	207921.	IBS-D

join

join multiple data sets together into a single data frame

#make two small tables (table_x and table_y) with one variable shared between them (sequenceID) to show how to do joins

table_x <- tibble(x = 1:5, 'sequenceID' = 12:16, 'diagnosis' = "UC", 'Family' = "Astroviridae", 'Abundance' = x ^ 2)
table_x
table_x <- select(table_x, sequenceID:Abundance)
table_x

sequenceID	diagnosis	Family	Abundance
12	UC	Astroviridae	1
13	UC	Astroviridae	4
14	UC	Astroviridae	9
15	UC	Astroviridae	16
16	UC	Astroviridae	25

table_y <- tribble(
  ~sequenceID , ~sex,
  14, "Female",
  15, "Male",
  16, "Male",
  17, "Female")
table_y

sequenceID	sex
14	Female
15	Male
16	Male
17	Female

#inner join : The simplest type of join is the inner join. An inner join matches pairs of observations whenever their keys are equal and leaves out any unmatched rows (the key here is sequenceID). The most important property of an inner join is that unmatched rows are not included in the result. This is usually a bad choice for joins as you will lose important data.

table_xy_inner<-table_x %>% 
  inner_join(table_y, by = "sequenceID")

#also can be simplified simply as:
table_xy_inner<- inner_join(table_x, table_y, by = "sequenceID")

table_xy_inner

sequenceID	diagnosis	Family	Abundance	sex
14	UC	Astroviridae	9	Female
15	UC	Astroviridae	16	Male
16	UC	Astroviridae	25	Male

#outer joins: A left join keeps all observations in x. A right join keeps all observations in y. A full join keeps all observations in x and y. These joins work by adding an additional “virtual” observation to each table. This observation has a key that always matches (if no other key matches), and a value filled with NA.

left join

table_xy_left <- left_join(table_x, table_y, by = "sequenceID")
table_xy_left

sequenceID	diagnosis	Family	Abundance	sex
12	UC	Astroviridae	1	NA
13	UC	Astroviridae	4	NA
14	UC	Astroviridae	9	Female
15	UC	Astroviridae	16	Male
16	UC	Astroviridae	25	Male

right join

table_xy_right<- right_join(table_x, table_y, by = "sequenceID")
table_xy_right

sequenceID	diagnosis	Family	Abundance	sex
14	UC	Astroviridae	9	Female
15	UC	Astroviridae	16	Male
16	UC	Astroviridae	25	Male
17	NA	NA	NA	Female

full join

table_xy_full<- full_join(table_x, table_y, by = "sequenceID")
table_xy_full

sequenceID	diagnosis	Family	Abundance	sex
12	UC	Astroviridae	1	NA
13	UC	Astroviridae	4	NA
14	UC	Astroviridae	9	Female
15	UC	Astroviridae	16	Male
16	UC	Astroviridae	25	Male
17	NA	NA	NA	Female

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

3. Data Transformation in R (tidyverse)

Introduction

Topics

For these examples, begin by loading the required packages: (only tidyverse is required for these particular tutorials)

Then load the test data:

select

distinct

filter

mutate

arrange

group_by

summarise

tally or count

rename

join

droplevels

unite or separate

slice

bind

ggplot

aes

geom

facet

stats

theme

ggsave

filter/reorder

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Uh oh!

Clone this wiki locally