This is a worked set of answers to the ggplot course
First we are going to load the main tidyverse library.
library(tidyverse)
## -- Attaching packages ---------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.2.1 v purrr 0.3.3
## v tibble 2.1.3 v dplyr 0.8.3
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## -- Conflicts ------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
We’ll plot out the data in the weight_chart.txt
file. Let’s load it and look first.
# Read the tab-separated weight chart and keep it as `weight`.
weight <- read_tsv("weight_chart.txt")
## Parsed with column specification:
## cols(
## Age = col_double(),
## Weight = col_double()
## )
# Print the tibble to check what was loaded.
weight
We’ll start with a simple plot, just setting the minimum aesthetics.
# Minimal scatterplot: only the required x and y aesthetics are mapped.
ggplot(weight, aes(x = Age, y = Weight)) +
  geom_point()
Now we can customise this a bit by adding fixed aesthetics to the geom_point()
function.
# Same scatterplot, but point size and colour are fixed on the geom itself
# rather than mapped from a column of the data.
ggplot(weight, aes(x = Age, y = Weight)) +
  geom_point(colour = "blue2", size = 3)
Now repeat but with a different geometry.
# Same mapping drawn with a line geometry instead of points.
ggplot(weight, aes(x = Age, y = Weight)) +
  geom_line()
Finally, combine the two geometries.
# Both geometries on one plot. The line layer is added first, so the
# points are drawn on top of it.
ggplot(weight, aes(x = Age, y = Weight)) +
  geom_line() +
  geom_point(colour = "blue2", size = 3)
Now let’s look at the chromosome_position_data.txt
file.
# Read the tab-separated chromosome position data into `chr.data`.
chr.data <- read_tsv("chromosome_position_data.txt")
## Parsed with column specification:
## cols(
## Position = col_double(),
## Mut1 = col_double(),
## Mut2 = col_double(),
## WT = col_double()
## )
# Show the first rows to check the layout.
head(chr.data)
We have the data in three separate columns at the moment so we need to use pivot_longer
to put them into a single column.
# Reshape to long format: every column except Position is stacked into a
# `sample` (former column name) / `value` pair. Overwrites `chr.data`.
chr.data <- pivot_longer(
  chr.data,
  cols = -Position,
  names_to = "sample",
  values_to = "value"
)
head(chr.data)
Now we can plot out a line graph of the position vs value for each of the samples. We’ll use colour to distinguish the lines for each sample.
# One line per sample, told apart by colour.
ggplot(chr.data, aes(x = Position, y = value, colour = sample)) +
  geom_line(size = 1)
Finally we’re going to look at the genome size vs number of chromosomes and colour it by domain in our genomes data.
# Read the comma-separated genomes table into `genomes`.
genomes <- read_csv("genomes.csv")
## Parsed with column specification:
## cols(
## Organism = col_character(),
## Groups = col_character(),
## Size = col_double(),
## Chromosomes = col_double(),
## Organelles = col_double(),
## Plasmids = col_double(),
## Assemblies = col_double()
## )
# Show the first rows to check the layout.
head(genomes)
To get at the Domain
we’ll need to split apart the Groups field.
# Split the ";"-delimited Groups column into three new columns
# (Domain, Kingdom, Class). Overwrites `genomes`.
genomes <- separate(
  genomes,
  col = Groups,
  into = c("Domain", "Kingdom", "Class"),
  sep = ";"
)
head(genomes)
Now we can draw the plot.
# Chromosome count against log10 genome size, coloured by Domain.
ggplot(genomes, aes(x = log10(Size), y = Chromosomes, colour = Domain)) +
  geom_point()
We want a barplot of the lengths of samples in category A.
# Read the tab-separated sample table into `small.file`.
small.file <- read_tsv("small_file.txt")
## Parsed with column specification:
## cols(
## Sample = col_character(),
## Length = col_double(),
## Category = col_character()
## )
# Show the first rows to check the layout.
head(small.file)
Since there is only one measure per sample there is no summarisation to be done so we use geom_col
rather than geom_bar
.
# Keep only category "A" rows and draw one bar per sample. geom_col()
# plots the Length values as given, without summarising.
ggplot(
  filter(small.file, Category == "A"),
  aes(x = Sample, y = Length)
) +
  geom_col()
Next we want a stripchart (geom_jitter
) of all of the lengths for each category. We need to use height=0
in the geom_jitter
to ensure that we don’t adjust the height of the points, only their width.
# Stripchart of Length per Category. height = 0 turns off vertical jitter
# so the plotted Length values are not altered.
ggplot(small.file, aes(x = Category, y = Length)) +
  geom_jitter(height = 0)
Whilst this worked it’s not very easy to tell the categories apart so we’ll tweak it to make it clearer.
# Clearer stripchart: colour by Category, narrower horizontal jitter,
# larger points, and no legend (the x axis already names the categories).
ggplot(small.file, aes(x = Category, y = Length, colour = Category)) +
  geom_jitter(size = 4, width = 0.3, height = 0, show.legend = FALSE)
Plot the distribution of expression values.
# Read the tab-separated expression values into `expression`.
expression <- read_tsv("expression.txt")
## Parsed with column specification:
## cols(
## Gene = col_character(),
## Expression = col_double()
## )
# Show the first rows to check the layout.
head(expression)
Let’s try the plots in a couple of ways.
# Histogram of the expression values using the default binning.
ggplot(expression, aes(x = Expression)) +
  geom_histogram(colour = "black", fill = "yellow")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# The same distribution drawn as a density curve.
ggplot(expression, aes(x = Expression)) +
  geom_density(colour = "black", fill = "yellow")
We could also play around with the resolution in either of these plots.
Either increasing the resolution:
# Histogram with an explicit bin width of 0.2.
ggplot(expression, aes(x = Expression)) +
  geom_histogram(binwidth = 0.2, colour = "black", fill = "yellow")
..or decreasing it.
# Density curve with the smoothing bandwidth set explicitly to 2.
ggplot(expression, aes(x = Expression)) +
  geom_density(bw = 2, colour = "black", fill = "yellow")
Plot the number of male deaths for all sites.
# Read the comma-separated cancer statistics into `cancer`.
cancer <- read_csv("cancer_stats.csv")
## Parsed with column specification:
## cols(
## Class = col_character(),
## Site = col_character(),
## `Male Cases` = col_double(),
## `Female Cases` = col_double(),
## `Male Deaths` = col_double(),
## `Female Deaths` = col_double()
## )
# Show the first rows to check the layout.
head(cancer)
# One bar per site. The column name contains a space, so it has to be
# quoted with backticks. Rows with a missing value are dropped by ggplot,
# which is what the warning below reports.
ggplot(cancer, aes(x = Site, y = `Male Deaths`)) +
  geom_col()
## Warning: Removed 5 rows containing missing values (position_stack).
We can’t see all of the labels as there isn’t enough space. We’ll fix this later, but for now let’s just show the 5 highest.
# Sort by male deaths (largest first), keep the first five rows and plot
# only those sites.
cancer %>%
  arrange(desc(`Male Deaths`)) %>%
  slice(seq_len(5)) %>%
  ggplot(aes(x = Site, y = `Male Deaths`)) +
  geom_col()
Now it works, but even though we fed it sorted data the plot still comes out in alphabetical order.
Plot the MutantRead distributions for good (QUAL==200) and bad (QUAL<200) variants.
# Read the child variants table into `child`. guess_max raises the number
# of rows readr inspects when guessing each column's type.
child <- read_csv("Child_Variants.csv", guess_max = 1000000)
## Parsed with column specification:
## cols(
## CHR = col_character(),
## POS = col_double(),
## dbSNP = col_character(),
## REF = col_character(),
## ALT = col_character(),
## QUAL = col_double(),
## GENE = col_character(),
## ENST = col_character(),
## MutantReads = col_double(),
## COVERAGE = col_double(),
## MutantReadPercent = col_double()
## )
# Show the first rows to check the layout.
head(child)
We need to make the good/bad category column.
# Add a `Good or not` label: "Good" where QUAL is exactly 200, "Bad"
# otherwise. Overwrites `child`.
child <- mutate(
  child,
  `Good or not` = if_else(QUAL == 200, "Good", "Bad")
)
head(child)
Now we can plot it. I did it on a log scale to make it a bit easier to look at.
# Violin plot of log2(MutantReads) for the good and bad variant groups.
ggplot(child, aes(x = `Good or not`, y = log2(MutantReads))) +
  geom_violin(colour = "black", fill = "yellow")
Set a theme and then redraw some stuff to see that it changes.
# Make theme_bw with a base font size of 16 the default theme for every
# plot drawn from here on.
theme_set(theme_bw(base_size = 16))
# Redraw the violin plot so the effect of the new default theme is visible.
ggplot(child, aes(x = `Good or not`, y = log2(MutantReads))) +
  geom_violin(colour = "black", fill = "yellow")
Yes, that definitely looks different, as will every plot from now on.
Redraw the previous bargraph but with the axes flipped so we can see all of the categories and we don’t have to filter them. I’m also going to order the results by the data to make the plot clearer, and I’ve removed the cancers which males can’t get.
# Drop sites with no male deaths value, order the sites by male deaths via
# reorder(), and flip the coordinates so the site labels run down the side.
# xlab() replaces the default axis title, which would otherwise show the
# reorder() expression.
ggplot(
  filter(cancer, !is.na(`Male Deaths`)),
  aes(x = reorder(Site, `Male Deaths`), y = `Male Deaths`)
) +
  geom_col() +
  xlab("Site") +
  coord_flip()
Plot a scatterplot of brainweight vs bodyweight and make various customisations.
Put the title in the centre
Make the axes log scale
Colour by Category but using a ColorBrewer palette
Change the ordering of the categories
# Read the tab-separated brain and body weight table into `brain`.
brain <- read_tsv("brain_bodyweight.txt")
## Parsed with column specification:
## cols(
## Species = col_character(),
## Category = col_character(),
## body = col_double(),
## brain = col_double()
## )
# Show the first rows to check the layout.
head(brain)
# Scatterplot of body weight against brain weight with these customisations:
#  - Category becomes a factor with explicit levels, which fixes the order
#    of the categories in the legend and colour assignment
#  - both axes use a log10 scale
#  - points are coloured by Category using the ColorBrewer "Set1" palette
#  - the title is centred with hjust = 0.5 (ggplot2 left-aligns it by default)
brain %>%
  mutate(
    Category = factor(Category, levels = c("Domesticated", "Wild", "Extinct"))
  ) %>%
  ggplot(aes(x = brain, y = body, colour = Category)) +
  geom_point(size = 4) +
  ggtitle("Brain vs Body weight") +
  xlab("Brainweight (g)") +
  ylab("Bodyweight (kg)") +
  scale_y_log10() +
  scale_x_log10() +
  scale_colour_brewer(palette = "Set1") +
  theme(plot.title = element_text(hjust = 0.5))
Finally do a barplot of all species showing their brainweight, but coloured by their bodyweight and using a custom colour scheme.
# One horizontal bar per species showing brain weight. The fill is
# log(body) mapped onto a custom five-colour gradient.
ggplot(brain, aes(x = Species, y = brain, fill = log(body))) +
  geom_col() +
  coord_flip() +
  scale_fill_gradientn(
    colours = c("blue2", "purple", "green2", "red2", "yellow")
  )
Plot a stripchart with a boxplot overlay to summarise the data in the 4 categories.
# Read the comma-separated wide-format table into `tidy1`.
tidy1 <- read_csv("tidy_data1.csv")
## Parsed with column specification:
## cols(
## DMSO = col_double(),
## `TGX-221` = col_double(),
## PI103 = col_double(),
## Akt1 = col_double()
## )
# Print the tibble to check what was loaded.
tidy1
First we restructure the data
# Stack every column into `sample` / `value` pairs, then drop the rows
# whose value is missing. Overwrites `tidy1`.
tidy1 <- tidy1 %>%
  pivot_longer(
    cols = everything(),
    names_to = "sample",
    values_to = "value"
  ) %>%
  filter(!is.na(value))
tidy1
Now we can do the plotting
# Grey boxplot per sample with every observation jittered on top.
# outlier.shape = NA hides the boxplot's own outlier markers. geom_jitter()
# already draws every point, so each outlier would otherwise appear twice
# (once grey and unjittered, once coloured and jittered).
ggplot(tidy1, aes(x = sample, y = value, colour = sample)) +
  geom_boxplot(color = "grey", size = 2, outlier.shape = NA) +
  geom_jitter(height = 0, width = 0.15, show.legend = FALSE, size = 5)
We can do the same thing but just showing a mean bar instead of a full boxplot.
# Stripchart with a grey bar at each sample's mean. The errorbar geom gets
# the mean as both its lower and upper limit, so it collapses to a single
# horizontal line.
ggplot(tidy1, aes(x = sample, y = value, colour = sample)) +
  stat_summary(
    geom = "errorbar",
    fun.ymin = mean,
    fun.ymax = mean,
    colour = "grey",
    size = 2
  ) +
  geom_jitter(size = 5, width = 0.15, height = 0, show.legend = FALSE)
Now we can plot the same thing as a barplot.
# Bars and error bars both computed by ggplot's summary stat. No summary
# function is given, so each layer falls back to mean_se(), as the two
# messages below report.
ggplot(tidy1, aes(x = sample, y = value)) +
  geom_bar(stat = "summary", color = "grey", fill = "yellow", size = 2) +
  stat_summary(geom = "errorbar", color = "grey", size = 2, width = 0.3)
## No summary function supplied, defaulting to `mean_se()
## No summary function supplied, defaulting to `mean_se()
We could also have done the same thing using pre-calculated values. We’ll use the STDEV instead of the SEM.
# Per-sample mean and standard deviation of `value`.
tidy1.summary <- tidy1 %>%
  group_by(sample) %>%
  summarise(mean = mean(value), stdev = sd(value))
tidy1.summary
# Bars at the pre-computed means, with error bars spanning the mean
# plus and minus one standard deviation.
ggplot(
  tidy1.summary,
  aes(x = sample, y = mean, ymin = mean - stdev, ymax = mean + stdev)
) +
  geom_col(color = "grey", fill = "yellow", size = 2) +
  geom_errorbar(colour = "grey", size = 2, width = 0.3)
Plot out a scatterplot of the two datasets against each other and customise the colouring.
# Read the tab-separated two-condition expression table into `up.down`.
up.down <- read_tsv("up_down_expression.txt")
## Parsed with column specification:
## cols(
## Gene = col_character(),
## Condition1 = col_double(),
## Condition2 = col_double(),
## State = col_character()
## )
# Show the first rows to check the layout.
head(up.down)
Let’s do a simple, uncustomised plot first.
# Plain scatterplot of the two conditions, coloured by State.
ggplot(up.down, aes(x = Condition1, y = Condition2, colour = State)) +
  geom_point(size = 0.5)
Now let’s improve the appearance and add some custom labels.
# Genes to label: both conditions above -1 and the two conditions more
# than 3 apart.
up.down.interesting <- filter(
  up.down,
  Condition1 > -1,
  Condition2 > -1,
  abs(Condition1 - Condition2) > 3
)
up.down.interesting
# ggrepel provides geom_text_repel().
library(ggrepel)
# Full scatterplot with manual colours for State, no legend and a y = x
# reference line. Only the rows in `up.down.interesting` get a text label.
ggplot(
  up.down,
  aes(x = Condition1, y = Condition2, colour = State, label = Gene)
) +
  geom_point(size = 1.5) +
  scale_colour_manual(values = c("blue2", "grey", "red2")) +
  theme(legend.position = "none") +
  geom_abline(intercept = 0, slope = 1, colour = "darkgrey", size = 1) +
  geom_text_repel(data = up.down.interesting, col = "black", box.padding = 1)
Clean up the data (restructure and remove NA values)
Draw a stripchart of cleanliness for males and females and facet by the day of the festival. Colour the males and females differently and add a line to show the mean values.
# Read the comma-separated festival hygiene data into `festival`.
festival <- read_csv("DownloadFestival.csv")
## Parsed with column specification:
## cols(
## ticknumb = col_double(),
## gender = col_character(),
## day1 = col_double(),
## day2 = col_double(),
## day3 = col_double()
## )
head(festival)
# Stack the day* columns into `day` / `cleanliness` pairs and drop rows
# with no cleanliness score. Overwrites `festival`.
festival <- festival %>%
  pivot_longer(
    cols = starts_with("day"),
    names_to = "day",
    values_to = "cleanliness"
  ) %>%
  filter(!is.na(cleanliness))
head(festival)
Now we can plot it out.
# Stripchart of cleanliness by gender, one facet column per day. The
# errorbar geom is given the mean for its centre and both limits, so it
# draws a single horizontal line at each group mean.
ggplot(festival, aes(x = gender, y = cleanliness, colour = gender)) +
  geom_jitter(width = 0.3, height = 0, alpha = 0.5, stroke = NA) +
  scale_colour_manual(values = c("blue2", "red2")) +
  stat_summary(
    geom = "errorbar",
    fun.y = mean,
    fun.ymin = mean,
    fun.ymax = mean,
    colour = "darkgrey",
    size = 3
  ) +
  facet_grid(cols = vars(day))
Finally we can draw the plot above but split by both day and attendance
# Count the rows for each ticket number and attach that count to every
# row of `festival` as `attended`. The join key is the shared `ticknumb`
# column, as the message below reports.
# count(ticknumb) is used instead of group_by() + count() so the result is
# not left grouped. The grouped form carries the grouping through the join
# into `festival`, where it would silently affect later dplyr verbs.
festival <- festival %>%
  count(ticknumb) %>%
  right_join(festival) %>%
  rename(attended = n)
## Joining, by = "ticknumb"
head(festival)
# Same stripchart with mean lines, now faceted by day (columns) and by
# the `attended` count (rows).
ggplot(festival, aes(x = gender, y = cleanliness, colour = gender)) +
  geom_jitter(width = 0.3, height = 0, alpha = 0.5, stroke = NA) +
  scale_colour_manual(values = c("blue2", "red2")) +
  stat_summary(
    geom = "errorbar",
    fun.y = mean,
    fun.ymin = mean,
    fun.ymax = mean,
    colour = "darkgrey",
    size = 3
  ) +
  facet_grid(rows = vars(attended), cols = vars(day))