Remember to re-set the working directory and install the packages first if needed.
library(dplyr) # data handling
library(stringr) # string manipulation
library(ggplot2) # plots
library(parallel) # parallel computing
library(lme4) # mixed models
library(lmerTest) # mixed models
library(pander) # nice Markdown tables
library(tidyr) # spread and gather functions
library(gridExtra) # arranging ggplots
library(mgcv) # generalised additive models
library(ggbeeswarm) # beeswarm plots
library(brms) # Bayesian models for meta-analysis
library(Cairo) # for exporting the character 'alpha' via ggsave
# Using a database allows quick access to the whole PubMed dataset without loading the whole thing to memory.
# You can download the pubmed_gender_data.sqlite3 database here: https://osf.io/bt9ya/
pubmed_db <- src_sqlite("~/Desktop/Gender Pubmed mining project/data for analysis/pubmed_gender_data.sqlite3", create = F)
pubmed_sqlite <- tbl(pubmed_db, "papers") # information about each individual paper
journals_sqlite <- tbl(pubmed_db, "journals") # information about each journal
Importantly, this function discards all papers where we do not know the gender of every single author. This is conservative but greatly simplifies the test – we’d have to be careful how we incorporated information from papers where some author’s genders were unknown. Our failures to identify author gender are almost certainly randomly distributed across papers with respect to gender homophily, and so this omission makes our tests less sensitive but should not bias their outcomes.
The function returns all the data that we need to run the test on every journal for which we found more than minimum.n
suitable papers that were published in the last nYears
.
# This function converts dates into a different format, namely time before 'the present' in years.
# 'The present' is defined as 20/08/2016 which is the date that Luke downloaded a local copy of the Medline database.
convert.dates <- function(dates) {
dates <- as.numeric(as.Date(dates, "%d_%m_%Y")) -
as.numeric(as.Date("20_08_2016","%d_%m_%Y")) # Convert the date to days before the present
return(dates / 365.25) # Convert from days to years
}
retrieve.authorship.data <- function(journals,
minimum.n,
start.of.time.period,
end.of.time.period,
restrict.by.n.authors = NULL,
file.name,
over.write = FALSE){
# Only do the test if over.write = TRUE or there is no datafile in the working directory
if(over.write | !file.exists(file.name)){
# Declare variables to hold the output
journal.names <- rep(NA, 10^5)
count.data <- list()
counter <- 0
# Loop over all the journals, try to get the data, and if
# there is enough, store the data we need as a list entry in 'count.data'
for(i in 1:length(journals)){
# Get the relevant data for the focal journal out of the database using dplyr
# the gender.95 column contains author genders in a format like "MFU" -
# MFU means the first author was male, second female, and third was of unknown gender.
# Genders were only recorded as M or F if the given name is associated with one gender >=95% of the time.
focal <- pubmed_sqlite %>%
filter(journal == journals[i]) %>% # restrict to focal journal
select(gender.95, date) %>% # get these columns
collect(n = Inf) %>% # get the data into memory
filter(nchar(gender.95) > 1) # only keep papers with >1 author
# Potentially further restrict to 2-author papers, 3-author papers... or 5 or more author papers
if(!is.null(restrict.by.n.authors)) {
if(restrict.by.n.authors != 5) focal <- focal %>% filter(nchar(gender.95) == restrict.by.n.authors)
else if(restrict.by.n.authors == 5) focal <- focal %>% filter(nchar(gender.95) >= 5)
}
# Convert the data column into 'years before present', to the nearest day
focal$date <- convert.dates(focal$date)
# Restrict the analysis to a specified period of time. My logic is that if the gender ratio changes over time, it will give the false impression of homophily, as explained in Bergstrom et al.'s comment here: http://www.michaeleisen.org/blog/?p=1931
# For example for papers from the last one year in the dataset, set start.of.time.period = 1 and end.of.time.period = 0
# For papers from between 5 and 6 years ago, set start.of.time.period = 6 and end.of.time.period = 5
focal <- focal %>%
filter(date > (start.of.time.period * -1),
date < (end.of.time.period * -1)) %>%
select(-date)
# Proceed if the sample size is big enough
if(nrow(focal) >= minimum.n){
focal <- (focal %>% as.data.frame())[,1] # Get the gender column as a character vector
focal <- focal[!str_detect(focal, "U")] # to keep it simple, let's only include papers where we know all the authors' genders
# And if the sample size is STILL big enough...
if(length(focal) >= minimum.n){
# Make a summary of the number of M and F authors on each paper, and the gender of the first and last author
n.char <- nchar(focal)
counts <- data.frame(nMales = str_count(focal, "M"),
nFemales = str_count(focal, "F"),
firstF = substr(focal, 1, 1) == "F",
lastF = substr(focal, n.char, n.char) == "F")
rm(focal) # Explicitly discard 'focal', to reduce memory footprint
counter <- counter + 1 # Save the data and increment the counter
count.data[[counter]] <- counts
journal.names[counter] <- journals[i]
}
else rm(focal) # Explicitly discard 'focal' anyway
}
}
names(count.data) <- journal.names[!is.na(journal.names)] # Name each entry in the list with the title of the journal the data are from
saveRDS(count.data, file = file.name)
}
}
These two functions estimate \(\alpha\) for a set of papers’ author lists. The version find.alpha.CIs()
performs bootstrapping resampling of the dataset to obtain the 95% confidence limits on \(\alpha\). find.alpha()
calculates \(\alpha\) without bootstrapping, making it much faster (used later when we are calculating the null distribution of \(\alpha\)).
\(\alpha\) is defined as p - q, and is termed “the coefficient of homophily”. p is the probability that a randomly-chosen co-author of a male author is a man, and q is the probability that a randomly-chosen co-author of a female author is a man. Thus, positive \(\alpha\) means that men coauthor with men, and women coauthor with women, more often than expected under the null model that coauthorship are random with respect to gender. Negative \(\alpha\) means the opposite: men and women coauthor more often than expected.
For more information about \(\alpha\), see this working paper by Bergstrom et al. http://eigenfactor.org/gender/assortativity/measuring_homophily.pdf. Also, see the comments section of this blog post: http://www.michaeleisen.org/blog/?p=1931. Bergstrom et al. point out that \(\alpha\) is equivalent to the Peason correlation coefficient between the genders of authors on a paper.
# The objects nMales and nFemales should give the number of male and female authors on a set of papers
# The function is written assuming that all papers have two or more authors of known gender.
find.alpha <- function(nMales, nFemales){
# Number of authors per paper, minus one
nAuthors <- nMales + nFemales - 1
# If you were to pick a SECOND random author from each paper,
# after picking a MALE the first time, what is the probability that the author is male?
p.second.pick.is.male.M <- (nMales - 1) / nAuthors
# If you were to pick a SECOND random author from each paper,
# after picking a FEMALE the first time, what is the probability that the author is male?
p.second.pick.is.male.F <- nMales / nAuthors
# If the probability of picking a male was zero (or one) the first time,
# make sure it's still zero (or one) for the second pick
p.second.pick.is.male.M[nMales == 0] <- 0
p.second.pick.is.male.F[nMales == 0] <- 0
p.second.pick.is.male.M[nFemales == 0] <- 1
p.second.pick.is.male.F[nFemales == 0] <- 1
# Find average proportion of male authors across all the men (p),
# and average proportion of male authors across all the women (q). Subtract to get alpha
return(sum(nMales * p.second.pick.is.male.M / sum(nMales)) -
sum(nFemales * p.second.pick.is.male.F / sum(nFemales)))
}
# This version uses bootstrapping to also estimate the 95% confidence limits on alpha
# Note that papers with lots of authors are more likely to be picked, ensuring that
# each individual author is equally likely to end up in the sample. So if
# 2-author vs 10-author papers have different amounts of homophily, or
# different gender ratios, the CIs should not be biased
find.alpha.CIs <- function(nMales, nFemales, nBoots, correction = NULL){
counts <- data.frame(nMales = nMales, nFemales = nFemales)
num.papers <- nrow(counts)
num.authors <- rowSums(counts)
# Resample 'num.papers' papers with replacement, 'nBoots' times.
# Papers with lots of authors are more likely to be picked
boot.alpha <- counts %>%
sample_n(size = num.papers * nBoots,
replace = TRUE,
weight = num.authors) %>%
mutate(replicate = rep(1:nBoots, each = num.papers)) %>%
group_by(replicate) %>% # Calculate alpha on each random sample
summarise(alpha = find.alpha(nMales, nFemales)) %>% .$alpha
# Optionally, apply a correction to get alpha' (e.g. if alpha is expected to
# be -0.05 under the null, add 0.05 to get alpha')
if(!is.null(correction)){
boot.alpha <- boot.alpha - correction
alpha <- find.alpha(nMales, nFemales) - correction
}
else alpha <- find.alpha(nMales, nFemales)
# If there are any re-samples entirely male/female (usually v rare), they generate NaNs. Remove them
boot.alpha <- boot.alpha[!is.nan(boot.alpha)]
c(alpha, as.numeric(quantile(boot.alpha, probs = c(0.025, 0.975))))
}
# This version calculates the difference in prop. male co-authors between male and female first (or last) authors
# The is.female column is boolean, and identifies whether each paper's first (or last) author is female
find.alpha.specific.author <- function(nMales, nFemales, is.female){
# Find the proportion of male coauthors for each first (or last) author
nMales[!is.female] <- nMales[!is.female] - 1 # If first/last author is male, don't count him among coauthors
nFemales[is.female] <- nFemales[is.female] - 1 # If first/last author is female, don't count her among coauthors
prop.male <- nMales / (nMales + nFemales) # proportion of male coauthors
return(sum(prop.male[!is.female]) / length(prop.male[!is.female]) -
sum(prop.male[is.female]) / length(prop.male[is.female]))
}
# Same, but it also gets the 95% CIs by bootstrapping
# Here, the first authors (or last authors) are the unit of replication, and there is always 1 per paper.
# So, no need to sample papers with lots of authors more frequently
find.alpha.specific.author.CIs <- function(nMales, nFemales,
nBoots, is.female,
correction = NULL){
counts <- data.frame(nMales = nMales,
nFemales = nFemales,
is.female = is.female)
num.papers <- nrow(counts)
boot.alpha <- counts %>%
sample_n(size = num.papers * nBoots,
replace = TRUE) %>%
mutate(replicate = rep(1:nBoots, each = num.papers)) %>%
group_by(replicate) %>% # Calculate alpha on each random sample
summarise(alpha = find.alpha.specific.author(nMales, nFemales, is.female)) %>% .$alpha
# Optionally, apply a correction to get alpha'
if(!is.null(correction)){
boot.alpha <- boot.alpha - correction
alpha <- find.alpha.specific.author(nMales, nFemales, is.female) - correction
}
else alpha <- find.alpha.specific.author(nMales, nFemales, is.female)
# If any re-samples chanced to be entirely male/female, they generate NaNs. Remove them
boot.alpha <- boot.alpha[!is.nan(boot.alpha)]
c(alpha, as.numeric(quantile(boot.alpha, probs = c(0.025, 0.975))))
}
For each journal, the function calculates \(\alpha\), and its 95% confidence limits. It also generates a two-tailed p-value for the observed \(\alpha\), under the null hypothesis that authors assort randomly with respect to gender. Small p-values mean that the observed set of papers shows significantly more, or less, gender homophily than expected under the null.
There is a good reason to test whether \(\alpha\) is significantly greater (or lower) than expected under the null, rather than simply checking whether the 95% CIs on \(\alpha\) overlap zero. The reason is that the null expectation for \(\alpha\) is not centred on zero for small datasets. This issue was summed up clearly by Prof. Carl Bergstrom in a comment on a blog post (http://www.michaeleisen.org/blog/?p=1931). Prof. Bergstrom wrote:
“Consider a tiny field with four authors, two men and two women, and one paper in each of the six two-author combinations. Now if you pick a man author at random, you are twice likely to pick the man in one of the four man-woman papers as you are to pick a man in the man-man paper. Therefore p = 1/3. But if pick a randomly chosen woman, again you’re more likely to pick a man-woman combination than the one woman-woman combination, and q = 2/3. As a result, \(\alpha\) = -1/3 even though the authorships seem to be distributed without gender bias.”
So, \(\alpha\) is sensitive to the size of the dataset being tested (at least for very small datasets). Our solution is to simulate many fake datasets, where authors are distributed randomly with respect to gender, but the datasets are otherwise identical (in terms of numbers of authorships by men and women, the number of papers, and the distribution of authors per paper). This is what this function does, when calculating whether \(\alpha\) is greater/lower than expected under random assortment by gender.
The function also returns a ‘corrected’ version of \(\alpha\), which is simply \(\alpha\) minus the null expected value of \(\alpha\) for this particular dataset (calculated by our simulation). Because the null expected \(\alpha\) is generally close to zero, the corrected version of \(\alpha\) is usually almost identical. However, the corrected version can deviate a little (as much as 0.05) for the smallest datasets.
# - if a vector is entered for is.female, then the test calculates alpha for a specific authorship position (i.e. first or last); otherwise, it calculates alpha for all authorship positions
# - the 'nYears' argument can be used to restrict the data to papers published nYears before the present. I chose nYears = 1, because if the gender ratio of authors changes over time (which it does), it will generate spurious coauthorship gender homophily
# - 'minimum.n' refers to the number of papers - the function will quit without performing the test if we did not get enough papers that fit the criteria (i.e. papers published within NYears of the present, with at least 2 authors, where we know all authors' genders with >95% confidence)
gender.grouping.test.one.journal <- function(nMales, nFemales, is.female = NULL, journal, nBoots, nSimulations){
print(journal)
# First count number (and distribution) of authors that we will swap around
total.males <- sum(nMales)
total.females <- sum(nFemales)
authors.per.paper <- nMales + nFemales
# If we are looking at first or last authors, we will not be swapping them, so subtract accordingly
if(!is.null(is.female)){
num.male.focal.authors <- sum(!is.female)
num.female.focal.authors <- sum(is.female)
# Quit unless there is a mixture of male and female first/last authors
if(num.male.focal.authors == 0 | num.female.focal.authors == 0) return(NULL)
total.males <- total.males - num.male.focal.authors
total.females <- total.females - num.female.focal.authors
authors.per.paper <- authors.per.paper - 1
}
# Now, let's simulate the expected alpha under the null hypothesis that authors
# publish together randomly with respect to gender.
# Simulate 'nSimulations' fake datasets, which have the same number of papers,
# and the same distribution of author list lengths per papers, as the real data,
# but male and female authors are randomly grouped across papers.
# The code here is ugly, but is vectorised for speed. Essentially I generate
# num.authors * nSimulations random numbers between zero and one, convert them
# to ranks, and use these ranks as index positions in 'pasted.gender'. This
# allows all the random numbers to be generated in one call, saving time.
# Then, it's just a matter of prettifying tapply's output using unname and unlist
pasted.gender <- c(rep(1, total.males), rep(0, total.females))
num.papers <- length(nMales)
num.authors <- total.males + total.females
simulated.data <- data.frame(
gender = pasted.gender[unlist(unname(tapply(runif(num.authors * nSimulations),
rep(1:nSimulations, each = num.authors), rank)))],
paperID = rep(unlist(mapply(rep, 1:num.papers, each = authors.per.paper)), nSimulations),
replicate = rep(1:nSimulations, each = num.authors))
rm(pasted.gender)
# Calculate number of males and females on each simulated paper, in each replicate
simulated.data <- simulated.data %>%
group_by(replicate, paperID) %>%
summarise(nFemales = sum(gender == 0),
nMales = sum(gender == 1)) %>%
select(-paperID)
# Find alpha for each simulation replicate
# If we are doing ALL the author positions, use find.alpha()
if(is.null(is.female)){
simulated.alpha.under.null <- simulated.data %>%
group_by(replicate) %>%
summarise(alpha = find.alpha(nMales, nFemales)) %>% .$alpha
median.alpha.under.null <- median(simulated.alpha.under.null)
# Calculate the observed alpha and its 95% CIs, both with and
# without adjusting for the fact that the null expected alpha can be non-zero
observed.alpha <- find.alpha.CIs(nMales, nFemales, nBoots = nBoots)
observed.alpha.adjusted <- find.alpha.CIs(nMales,
nFemales, nBoots = nBoots,
correction = median.alpha.under.null)
}
else{ # Otherwise, use find.alpha.specific.author()
simulated.data$is.female <- rep(is.female, nSimulations) # Add the first/last authors back in
simulated.data$nMales[!simulated.data$is.female] <-
simulated.data$nMales[!simulated.data$is.female] + 1
simulated.data$nFemales[simulated.data$is.female] <-
simulated.data$nFemales[simulated.data$is.female] + 1
simulated.alpha.under.null <- simulated.data %>%
group_by(replicate) %>%
summarise(alpha = find.alpha.specific.author(nMales, nFemales, is.female)) %>% .$alpha
median.alpha.under.null <- median(simulated.alpha.under.null)
observed.alpha <- find.alpha.specific.author.CIs(
nMales, nFemales, is.female, nBoots = nBoots)
observed.alpha.adjusted <- find.alpha.specific.author.CIs(
nMales, nFemales, is.female, nBoots = nBoots, correction = median.alpha.under.null)
}
# The 2-tailed p-value is the proportion of null-simulated alpha values with
# a value at least as far from zero as the observed alpha
# Thus, a significant p-value means that the observed alpha is larger OR smaller than expected
two.tail.p.value <- sum(abs(simulated.alpha.under.null) >=
abs(observed.alpha[1])) / length(simulated.alpha.under.null)
# Return the pertinent results
data.frame(journal = journal,
n.useable.papers = num.papers,
n.authors = num.authors,
gender.ratio.of.sample = 100 * total.females /
(total.females + total.males),
observed.alpha = observed.alpha[1],
lower.95.CI = observed.alpha[2],
upper.95.CI = observed.alpha[3],
alpha.adjusted = observed.alpha.adjusted[1],
lower.95.CI2 = observed.alpha.adjusted[2],
upper.95.CI2 = observed.alpha.adjusted[3],
two.tail.p.value = two.tail.p.value)
}
Runs gender.grouping.test.one.journal()
on all the journals in ‘journals’, in parallel across multiple CPUs.
gender.grouping.test.many.journals <- function(count.data,
authors, output.file,
nBoots, nSimulations,
number.of.cores, over.write = FALSE){
# Only do the test if over.write = TRUE or there is no datafile in the working directory
if(over.write | !file.exists(output.file)){
journals <- names(count.data)
nJournals <- length(count.data)
# Set up a cluster
cl <- makePSOCKcluster(number.of.cores)
setDefaultCluster(cl)
clusterExport(NULL, c("find.alpha",
"find.alpha.CIs",
"find.alpha.specific.author",
"find.alpha.specific.author.CIs",
"gender.grouping.test.one.journal",
"pubmed_sqlite"))
clusterEvalQ(NULL, library(dplyr))
# Run the desired test on each journal, in parallel
if(authors == "all"){
results <- parLapply(NULL, 1:nJournals, function(i)
gender.grouping.test.one.journal(count.data[[journals[i]]]$nMales,
count.data[[journals[i]]]$nFemales,
NULL,
journal = journals[i],
nBoots = nBoots,
nSimulations = nSimulations))
}
else if(authors == "first"){
results <- parLapply(NULL, 1:nJournals, function(i)
gender.grouping.test.one.journal(count.data[[journals[i]]]$nMales,
count.data[[journals[i]]]$nFemales,
count.data[[journals[i]]]$firstF,
journal = journals[i],
nBoots = nBoots,
nSimulations = nSimulations))
}
else if(authors == "last"){
results <- parLapply(NULL, 1:nJournals, function(i)
gender.grouping.test.one.journal(count.data[[journals[i]]]$nMales,
count.data[[journals[i]]]$nFemales,
count.data[[journals[i]]]$lastF,
journal = journals[i],
nBoots = nBoots,
nSimulations = nSimulations))
}
setDefaultCluster(NULL) # close cluster and write the results to disk
stopCluster(cl = cl)
write.csv(do.call("rbind", results), file = output.file, row.names = FALSE)
}
}
Table S1: Sample sizes for the two datasets, which comprise papers published in the timeframes August 2005 - August 2006, and August 2015 - August 2016.
minimum.n <- 50 # Only include journals with at least 50 papers in the specified range of time
unique.journals <- pubmed_sqlite %>%
select(journal) %>% distinct() %>%
collect(n = Inf) %>% .$journal # Get a list of all the journals in the database
# Get all the data we need out of the database, and save as an RDS file - for papers in the last 12 months
retrieve.authorship.data(unique.journals,
minimum.n = minimum.n,
start.of.time.period = 1,
end.of.time.period = 0,
file.name = "data/last.year.rds",
over.write = FALSE)
# Get all the data we need out of the database, and save as an RDS file - for papers that are between 10 and 11 years old
retrieve.authorship.data(unique.journals,
minimum.n = minimum.n,
start.of.time.period = 11,
end.of.time.period = 10,
file.name = "data/older.papers.rds",
over.write = FALSE)
# Get all the data we need out of the database, and save as an RDS file - for just the 2-author papers in the last 12 months
retrieve.authorship.data(unique.journals,
minimum.n = minimum.n,
start.of.time.period = 1,
end.of.time.period = 0,
restrict.by.n.authors = 2,
file.name = "data/last.year_2author.rds",
over.write = FALSE)
# Get all the data we need out of the database, and save as an RDS file - for just the 3-author papers in the last 12 months
retrieve.authorship.data(unique.journals,
minimum.n = minimum.n,
start.of.time.period = 1,
end.of.time.period = 0,
restrict.by.n.authors = 3,
file.name = "data/last.year_3author.rds",
over.write = FALSE)
# Get all the data we need out of the database, and save as an RDS file - for just the 4-author papers in the last 12 months
retrieve.authorship.data(unique.journals,
minimum.n = minimum.n,
start.of.time.period = 1,
end.of.time.period = 0,
restrict.by.n.authors = 4,
file.name = "data/last.year_4author.rds",
over.write = FALSE)
# Get all the data we need out of the database, and save as an RDS file - for just the 5-plus-author papers in the last 12 months
retrieve.authorship.data(unique.journals,
minimum.n = minimum.n,
start.of.time.period = 1,
end.of.time.period = 0,
restrict.by.n.authors = 5, # NB this actually means "5 or more"
file.name = "data/last.year_5author.rds",
over.write = FALSE)
# Get metadata about each journal (research discipline, ISI impact factor, and the country that publishes it)
# see Holman et al. in prep, for information on how journals were assigned to disciplines
journal.data <- journals_sqlite %>% collect(n=Inf) %>%
rename(journal = short.title,
discipline = Discipline,
country = journal.country) %>%
select(journal, discipline, country, IF) %>%
as.data.frame %>%
distinct(journal, .keep_all = T)
make.sample.size.table <- function(filename){
count.data <- readRDS(paste("data/", filename, ".rds", sep = ""))
data.frame(Quantity = c("Number of disciplines",
"Number of journals",
"Number of papers",
"Number of authors",
"Median number of papers per journal",
"Median number of authors per journal",
"Median number of authors per paper"),
Value = c(journal.data %>% filter(journal %in% names(count.data)) %>%
select(discipline) %>%
distinct() %>% nrow(),
length(count.data),
nrow(do.call("rbind", count.data)[,1:2]),
sum(do.call("rbind", count.data)[,1:2]),
median(sapply(count.data, nrow)),
median(sapply(count.data, function(x) sum(x[,1]) + sum(x[,2]))),
median(rowSums(do.call("rbind", count.data)[,1:2]))))
}
if(!file.exists("manuscript/sample.size.table.csv")){
sample.size.table <- make.sample.size.table("older.papers") %>%
rename(`n (2005-2006)` = Value) %>%
left_join(make.sample.size.table("last.year") %>%
rename(`n (2015-2016)` = Value), by = "Quantity")
write.csv(sample.size.table, file = "manuscript/sample.size.table.csv", row.names = FALSE)
}
read.csv("manuscript/sample.size.table.csv") %>% pander(split.cell = 40, split.table = Inf)
Quantity | n..2005.2006. | n..2015.2016. |
---|---|---|
Number of disciplines | 101 | 107 |
Number of journals | 1192 | 2116 |
Number of papers | 151652 | 276879 |
Number of authors | 647634 | 1311213 |
Median number of papers per journal | 87 | 87 |
Median number of authors per journal | 371 | 413 |
Median number of authors per paper | 4 | 4 |
Here, ‘sufficient data’ means that there are at least 50 papers, for which there are 2+ authors, and we know all authors’ genders, and these papers were published in the last year. We calculate \(\alpha\) for three types of authors:
# Calculate alpha across all authorship positions for all suitable NEW papers
gender.grouping.test.many.journals(readRDS("data/last.year.rds"),
"all",
"data/homophily.results.all.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha for first authors for all suitable NEW papers
gender.grouping.test.many.journals(readRDS("data/last.year.rds"),
"first",
"data/homophily.results.first.authors.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha for last authors for all suitable NEW papers
gender.grouping.test.many.journals(readRDS("data/last.year.rds"),
"last",
"data/homophily.results.last.authors.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha across all authorship positions for all suitable OLD papers
gender.grouping.test.many.journals(readRDS("data/older.papers.rds"),
"all",
"data/older.homophily.results.all.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha for first authors for all suitable OLD papers
gender.grouping.test.many.journals(readRDS("data/older.papers.rds"),
"first",
"data/older.homophily.results.first.authors.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha for last authors for all suitable OLD papers
gender.grouping.test.many.journals(readRDS("data/older.papers.rds"),
"last",
"data/older.homophily.results.last.authors.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha across all authorship positions for all suitable NEW papers with TWO authors
gender.grouping.test.many.journals(readRDS("data/last.year_2author.rds"),
"all",
"data/homophily.results.2author.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha across all authorship positions for all suitable NEW papers with THREE authors
gender.grouping.test.many.journals(readRDS("data/last.year_3author.rds"),
"all",
"data/homophily.results.3author.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha across all authorship positions for all suitable NEW papers with FOUR authors
gender.grouping.test.many.journals(readRDS("data/last.year_4author.rds"),
"all",
"data/homophily.results.4author.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
# Calculate alpha across all authorship positions for all suitable NEW papers with FIVE authors
gender.grouping.test.many.journals(readRDS("data/last.year_5author.rds"),
"all",
"data/homophily.results.5author.csv",
nBoots = 1000,
nSimulations = 1000,
number.of.cores = 8,
over.write = FALSE)
Here, we assign extra information to each journal - research discipline (see Holman et al. 2018), ISI impact factor (where available), and the country that publishes the journal (where known; information taken from PubMed).
# Calculate the impact factor of each journal relative to other journals in its discipline (using residuals from a mixed model)
journal.data$logIF <- log10(journal.data$IF) # Take the log10 of the impact factor
journal.data$standardised.IF <- NA
# Calculate residual log10 IF for each journal (i.e. IF relative to the field that it is in)
journal.data$standardised.IF[!is.na(journal.data$logIF)] <-
resid(lmer(logIF ~ (1|discipline), data = journal.data[!is.na(journal.data$logIF), ]))
# Function to neatly merge the journal-specific data with the gender homophily results
# Also add another column of p-values, which are False Discovery Rate-corrected using B-H method
merge.and.clean <- function(homophily.filepath, journal.data){
left_join(read.csv(homophily.filepath, stringsAsFactors = FALSE),
journal.data,
by = "journal") %>%
select(discipline,
journal,
standardised.IF,
country,
n.useable.papers,
n.authors,
gender.ratio.of.sample,
observed.alpha,
lower.95.CI,
upper.95.CI,
alpha.adjusted,
lower.95.CI2,
upper.95.CI2,
two.tail.p.value) %>%
arrange(-n.useable.papers) %>% # Sort by sample size
mutate(adjusted.p.value = p.adjust(two.tail.p.value, method = "BH"))
}
# Load up all the homophily results, and merge them with the journal-specific information
gender.homophily.all <- merge.and.clean("data/homophily.results.all.csv", journal.data)
gender.homophily.first <- merge.and.clean("data/homophily.results.first.authors.csv", journal.data)
gender.homophily.last <- merge.and.clean("data/homophily.results.last.authors.csv", journal.data)
gender.homophily.all.old <- merge.and.clean("data/older.homophily.results.all.csv", journal.data)
gender.homophily.first.old <- merge.and.clean("data/older.homophily.results.first.authors.csv", journal.data)
gender.homophily.last.old <- merge.and.clean("data/older.homophily.results.last.authors.csv", journal.data)
# Bind all 5 datasets into a single one, with a column called 'Subset' to identify them
master.dataset <- rbind(gender.homophily.all,
gender.homophily.first,
gender.homophily.last,
gender.homophily.all.old,
gender.homophily.first.old,
gender.homophily.last.old) %>%
mutate(Subset = unlist(mapply(rep, c("All authors", "First authors", "Last authors"),
each = c(nrow(gender.homophily.all),
nrow(gender.homophily.first),
nrow(gender.homophily.last),
nrow(gender.homophily.all.old),
nrow(gender.homophily.first.old),
nrow(gender.homophily.last.old)))),
Age = unlist(mapply(rep, c("New", "Old"),
each = c(sum(nrow(gender.homophily.all),
nrow(gender.homophily.first),
nrow(gender.homophily.last)),
sum(nrow(gender.homophily.all.old),
nrow(gender.homophily.first.old),
nrow(gender.homophily.last.old)))))) %>%
select(discipline, journal, Subset, Age, standardised.IF, n.useable.papers, n.authors,
gender.ratio.of.sample, observed.alpha, lower.95.CI, upper.95.CI, alpha.adjusted,
lower.95.CI2, upper.95.CI2, two.tail.p.value, adjusted.p.value) %>%
rename(unadjusted.p.value = two.tail.p.value)
fig_s8 <- master.dataset %>%
ggplot(aes(x = observed.alpha, y = alpha.adjusted)) +
geom_abline(intercept = 0, slope = 1, colour = "darkgrey") +
geom_point(size=0.6, alpha=0.4) +
ylab("Adjusted measure of homophily (\u03B1')") +
xlab("Unadjusted measure of homophily (\u03B1)") +
theme_minimal()
ggsave(fig_s8, file = "figures/S8 Fig.pdf", height = 5, width = 5, device = cairo_pdf)
fig_s8
S8 Figure: There is a very strong correlation between the values of \(\alpha\) and \(\alpha'\) calculated for each journal, though in a handful of cases the difference is considerable. The deviation between \(\alpha\) and \(\alpha'\) is greatest for journals for which there is a small sample size (see S9 Fig).
As expected, unadjusted \(\alpha\) is slightly downwardly biased for small datasets, because the expected value of \(\alpha\) under the null is less than 0. Thus, our adjusted \(\alpha'\) statistic, which is the observed \(\alpha\) minus the value of \(\alpha\) expected under the null, tends to be substantially higher than \(\alpha\) for small datasets. All of our results do not differ qualitatively if we use the unadjusted \(\alpha\), but we have elected to present the results for \(\alpha'\) in light of the issues with unadjusted \(\alpha\) for small datasets.
fig_s9 <- master.dataset %>%
filter(Age == "New") %>%
ggplot(aes(x = log10(n.useable.papers),
y = alpha.adjusted - observed.alpha)) +
geom_point(alpha = 0.3, size = 0.5) +
facet_wrap(~Subset) +
ylab("\u03B1' - \u03B1") +
xlab("Log10 sample size (number of papers)") +
theme_minimal()
ggsave(fig_s9, file = "figures/S9 Fig.pdf", height = 5, width = 5, device = cairo_pdf)
fig_s9
S9 Figure: For journals for which we recovered a small number of papers (<100), the unadjusted metric \(\alpha\) was downwardly biased. This fits our expectations: because authors cannot be their own co-authors, small datasets will tend to produce negative estimates of \(\alpha\) even if authors assort randomly with respect to gender (see main text). This suggests that \(\alpha'\) is a more useful measure of homophily and heterophily, especially for small samples.
S1 Data: This file gives the \(\alpha'\) values calculated for each journal, in the 2005 and 2015 samples, and for each type of author (all authors, first authors, and last authors). The tables gives the impact factor of each journal, the sample size, \(\alpha\) and \(\alpha'\) and their 95% CIs, and the p-value from a 2-tailed test evaluating the null hypothesis that \(\alpha\) is zero (both raw and FDR-corrected p-values are shown).
# Save and export a neat and tidy version for the supplementary material
neat <- master.dataset
names(neat) <- gsub("[.]", " ", names(neat))
names(neat) <- gsub("Subset", "Author position", names(neat))
names(neat) <- paste(toupper(substring(names(neat), 1, 1)), substring(names(neat), 2), sep="")
names(neat) <- gsub("N", "n", names(neat))
names(neat) <- gsub("2", "", names(neat))
for(i in which(sapply(neat, is.numeric))) neat[,i] <- round(neat[,i], 3)
write.csv(neat, file = "supplement/S1 data.csv", row.names = FALSE)
rm(neat)
Most journals show gender homophily (i.e. \(\alpha' > 0\)), not heterophily, meaning that people are more likely to co-publish with colleagues of the same gender.
figure2A <- master.dataset %>%
filter(Age == "New") %>%
mutate(Significant = 1 * (adjusted.p.value < 0.05),
Significant = replace(Significant, Significant == 0, "No"),
Significant = replace(Significant, Significant == 1, "Yes"))
n.sig.homo <- with(figure2A, sum(Significant[Subset == "All authors"] == "Yes" &
alpha.adjusted[Subset == "All authors"] > 0))
n.sig.hetero <- with(figure2A, sum(Significant[Subset == "All authors"] == "Yes" &
alpha.adjusted[Subset == "All authors"] < 0))
n.journals <- sum(figure2A$Subset == "All authors")
figure2A <- figure2A %>%
ggplot(aes(x = alpha.adjusted, fill = Significant)) +
geom_vline(xintercept = 0, linetype = 2) +
geom_area(binwidth = 0.01, colour = "black", stat = "bin") +
scale_fill_manual(values = c("skyblue", "white")) +
xlab("Coefficient of homophily (\u03B1')") +
ylab("Number of journals") +
facet_wrap(~Subset, scales = "free_y", ncol=1) +
theme_minimal(12) +
theme(legend.position = c(0.9, 0.96))
figure2A
Figure 2A: See the legend for Figure 2 below.
Table S2: Number of journals showing statistically significant homophily or heterophily, in two one-year periods. The significance threshold was p < 0.05, and p-values were adjused using Benjamini-Hochberg false discovery rate correction. Note that the power of our test is lower for the 2005-2006 data because fewer papers were recovered per journal: thus, it is not meaningful to compare the % significant journals (i.e. 11% vs 24%) between the two time periods.
data.frame(Year = c("2015-2016", "2005-2006"),
rbind(master.dataset %>%
filter(adjusted.p.value < 0.05, Age == "New") %>%
mutate(homophily = as.logical(ceiling(alpha.adjusted))) %>%
.$homophily %>% table() %>% unname(),
master.dataset %>%
filter(adjusted.p.value < 0.05, Age == "Old") %>%
mutate(homophily = as.logical(ceiling(alpha.adjusted))) %>%
.$homophily %>% table() %>% unname()),
Total.n = c(sum(master.dataset$Age == "New" & master.dataset$Subset == "All authors"),
sum(master.dataset$Age == "Old" & master.dataset$Subset == "All authors"))) %>%
rename(`Significant heterophily` = X1, `Significant homophily` = X2, `Year range` = Year,
`Total n journals` = Total.n) %>%
pander(split.cell = 40, split.table = Inf)
Year range | Significant heterophily | Significant homophily | Total n journals |
---|---|---|---|
2015-2016 | 2 | 1472 | 2116 |
2005-2006 | 1 | 391 | 1192 |
Although the effect is weak, we show here that the average \(\alpha\) across journals was lower in 2005-2006 than it was in 2015-2016. One possible reason is that researchers have become more likely to preferentially select same-gendered colleagues over time. Another hypothesis is an increase in the Wahlund effect with time: for example, journals might have become more multi-disciplinary, or started publishing papers from a more diverse range of countries.
# For each journal where we have observations of alpha' at
# both time points, calculate the change in alpha'
old.vs.new <- master.dataset %>%
filter(Subset == "All authors") %>%
group_by(journal) %>%
summarise(alpha.old = alpha.adjusted[2],
alpha.new = alpha.adjusted[1],
SE.old = (upper.95.CI2[2] - lower.95.CI2[2]) / (2 * 1.96),
SE.new = (upper.95.CI2[1] - lower.95.CI2[1]) / (2 * 1.96),
lower883.old = alpha.old - SE.old * 1.190,
upper883.old = alpha.old + SE.old * 1.190,
lower883.new = alpha.new - SE.new * 1.190,
upper883.new = alpha.new + SE.new * 1.190,
Significant = "No",
Significant = replace(Significant, lower883.new > upper883.old, "Yes"),
Significant = replace(Significant, lower883.old > upper883.new, "Yes")) %>%
filter(!is.na(alpha.old))
fig_s2 <- old.vs.new %>%
ggplot(aes(x = alpha.new - alpha.old, fill = Significant)) +
geom_vline(xintercept = 0, linetype = 2) +
geom_histogram(binwidth = 0.01, colour = "black", fill = "skyblue") +
scale_fill_manual(values = c("skyblue", "white")) +
xlab("Change in \u03B1' over the last ten years") +
ylab("Number of journals") +
theme_minimal()
ggsave(fig_s2, file = "figures/S2 Fig.pdf", width = 5, height = 5, device = cairo_pdf)
fig_s2
S2 Figure: Histogram showing the distribution of differences in \(\alpha'\) between the 2015-16 and 2005-6 samples, where positive numbers indicate an increase in \(\alpha'\) with time. The mean is slightly positive (i.e. 0.004), indicating a mild increase in average \(\alpha'\) with time.
Journals had slightly lower values of \(\alpha'\) ten years ago than they do today. The difference in mean \(\alpha'\) values has an effect size of Cohen’s d = 0.09, which is considered small. The effect is statistically significant, since we have a large sample of journals. The model includes ‘journal’ and ‘discipline’ as random effects, preventing pseudoreplication caused by measuring the same journals twice (i.e. once in each time period), and allowing us to test whether the discipline of the journal explains any variance in \(\alpha'\). Both random factors explain only around 1% of the variance.
# Note that the response variable gets scaled, to make the effect size interpretable as Cohen's d
summary(lmer(scale(alpha.adjusted) ~ Age + (1|journal) + (1|discipline),
weights = 1 / ((upper.95.CI2 - lower.95.CI2)/(2*1.96)), # Weight each observation of alpha by 1/SE
data = master.dataset %>%
filter(Subset == "All authors", journal %in% old.vs.new$journal)))
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: scale(alpha.adjusted) ~ Age + (1 | journal) + (1 | discipline)
## Data: master.dataset %>% filter(Subset == "All authors", journal %in%
## old.vs.new$journal)
## Weights: 1/((upper.95.CI2 - lower.95.CI2)/(2 * 1.96))
##
## REML criterion at convergence: 5063.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.1327 -0.5435 0.0794 0.6395 4.4672
##
## Random effects:
## Groups Name Variance Std.Dev.
## journal (Intercept) 0.05494 0.2344
## discipline (Intercept) 0.05106 0.2260
## Residual 22.96801 4.7925
## Number of obs: 1918, groups: journal, 959; discipline, 100
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) -0.09029 0.03955 100.71979 -2.283 0.0245 *
## AgeOld -0.09080 0.03751 953.24228 -2.421 0.0157 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr)
## AgeOld -0.430
When we compared the \(\alpha'\) values of journals from different research disciplines, we found that the strength of homophily varied between disciplines, but not a lot. That is, the variance within disciplines was substantially greater than the variance between disciplines (see above linear mixed model: discipline explains about 4% of the variance in \(\alpha'\)).
# There is plenty of homophily, and basically no heterophily at all.
# The disciplines that contain more diverse topics (e.g. "Multidisciplinary") show most homophily, as expected.
# Still, there is homophily in many comparatively narrow disciplines too.
disc.summary <- master.dataset %>%
filter(!is.na(alpha.adjusted),
Age == "New",
Subset == "All authors") %>%
group_by(discipline) %>%
summarise(Mean.gender.ratio = mean(gender.ratio.of.sample) %>% round(1),
Mean.adjusted.alpha = mean(alpha.adjusted) %>% round(3),
SE.of.adjusted.alpha = (sd(alpha.adjusted)/sqrt(length(alpha.adjusted))) %>% round(3),
Median.adjusted.alpha = median(alpha.adjusted) %>% round(3),
n.journals = n(),
n.significant.homophily = sum(alpha.adjusted > 0 & unadjusted.p.value < 0.05),
n.significant.heterophily = sum(alpha.adjusted < 0 & unadjusted.p.value < 0.05),
n.FDR.significant.homophily = sum(alpha.adjusted > 0 & adjusted.p.value < 0.05),
n.FDR.significant.heterophily = sum(alpha.adjusted < 0 & adjusted.p.value < 0.05)) %>%
arrange(-Mean.adjusted.alpha) %>%
mutate(n.significant.homophily = paste(n.significant.homophily,
" (", n.FDR.significant.homophily, ")", sep = ""),
n.significant.heterophily = paste(n.significant.heterophily,
" (", n.FDR.significant.heterophily, ")", sep = ""),
discipline = factor(discipline, levels = discipline)) %>%
select(-n.FDR.significant.homophily, -n.FDR.significant.heterophily) %>%
rename(Discipline = discipline) %>%
as.data.frame()
disc.summary$n.significant.homophily[disc.summary$n.significant.homophily == "0 (0)"] <- 0
disc.summary$n.significant.heterophily[disc.summary$n.significant.heterophily == "0 (0)"] <- 0
names(disc.summary) <- gsub("[.]", " ", names(disc.summary))
figure2B <- gender.homophily.all %>%
filter(discipline %in% disc.summary$Discipline[disc.summary$`n journals` > 10]) %>%
mutate(Discipline = factor(discipline, levels = rev(disc.summary$Discipline))) %>%
ggplot(aes(x = Discipline, y = alpha.adjusted)) +
geom_hline(yintercept = 0, linetype = 2) +
geom_quasirandom(aes(colour = Discipline), alpha = 0.3, bandwidth = 0.25) +
geom_boxplot(outlier.size = 0.5, outlier.shape = NA, fill = NA) +
ylab(NULL) +
coord_flip() +
theme_minimal() +
theme(legend.position = "none")
grid.arrange(figure2A + xlab(NULL), figure2B, nrow = 1, bottom = "Coefficient of homophily (\u03B1')")
ggsave(grid.arrange(figure2A + xlab(NULL), figure2B, nrow = 1, bottom = "Coefficient of homophily (\u03B1')"), filename = "figures/Fig2.pdf", height = 8, width = 7, device = cairo_pdf)
Figure 2: Of the 2116 journals for which we had adequate data in 2015-2016, 825 showed statistically significant evidence of homophily (denoted by \(\alpha' > 0\)), and 1 showed statistically significant evidence of heterophily (\(\alpha' < 0\)), after adjusting p-values using Benjamini-Hochberg false discovery rate correction. In the left panels, the white area shows the number of journals for which homophily was significantly stronger than expected under the null hypothesis (p < 0.05), while the blue area shows the remainder. Patterns were similar whether \(\alpha'\) was calculated for all authors, for first authors only, or for last authors only. The right panel shows the distribution of the \(\alpha'\) values across journals, split by the research discipline of the journal. The points show the \(\alpha'\) for individual journals, and the boxes show quartiles 1, 2 and 3. The boxplot whiskers show the distance of the farthest point that is within 1.5 times the interquartile range from the edge of the box.
S4 Data: The table shows the distribution of the \(\alpha'\) values across journals, split by the research discipline. The gender ratio column shows the percentage of women authors in the sample used to calculate \(\alpha'\), across all authorship positions. In the last two columns, the numbers outside parentheses give the number of journals that deviate statistically significantly from zero, while the numbers inside parentheses give the number that remain significant after false discovery rate correction.
s4_data <- disc.summary %>% filter(`n journals` > 9)
write.csv(s4_data, file = "supplement/S4 data.csv", row.names = FALSE)
s4_data %>% pander(split.cell = 40, split.table = Inf)
Discipline | Mean gender ratio | Mean adjusted alpha | SE of adjusted alpha | Median adjusted alpha | n journals | n significant homophily | n significant heterophily |
---|---|---|---|---|---|---|---|
Urology | 19.4 | 0.12 | 0.02 | 0.101 | 19 | 13 (11) | 0 |
Public Health | 53.3 | 0.104 | 0.007 | 0.1 | 36 | 24 (18) | 0 |
Nursing | 77.4 | 0.103 | 0.018 | 0.102 | 24 | 11 (9) | 0 |
Radiology | 30.5 | 0.1 | 0.016 | 0.101 | 12 | 9 (8) | 0 |
Nutritional Sciences | 54.4 | 0.099 | 0.009 | 0.101 | 33 | 23 (20) | 0 |
Dentistry | 37.7 | 0.093 | 0.014 | 0.107 | 23 | 12 (9) | 0 |
Unclassified | 36.5 | 0.091 | 0.009 | 0.083 | 92 | 43 (34) | 0 |
Vascular Diseases | 32.2 | 0.091 | 0.011 | 0.083 | 14 | 10 (8) | 0 |
Nephrology | 35.7 | 0.088 | 0.017 | 0.097 | 14 | 7 (6) | 0 |
Traumatology | 28.8 | 0.088 | 0.022 | 0.08 | 18 | 10 (9) | 0 |
Anesthesiology | 34.6 | 0.087 | 0.016 | 0.078 | 17 | 9 (9) | 0 |
Biotechnology | 37.6 | 0.087 | 0.016 | 0.08 | 19 | 8 (7) | 0 |
Medicine | 39.8 | 0.087 | 0.004 | 0.085 | 343 | 190 (155) | 1 (1) |
Physiology | 37.6 | 0.086 | 0.011 | 0.081 | 34 | 22 (18) | 0 |
Communicable Diseases | 44.2 | 0.085 | 0.018 | 0.072 | 13 | 9 (6) | 0 |
Otolaryngology | 31.6 | 0.085 | 0.012 | 0.087 | 11 | 6 (3) | 0 |
Cardiology | 24.4 | 0.083 | 0.007 | 0.079 | 61 | 35 (32) | 0 |
Multidisciplinary | 35.4 | 0.083 | 0.011 | 0.081 | 22 | 17 (14) | 0 |
Pediatrics | 50.5 | 0.081 | 0.008 | 0.084 | 56 | 29 (21) | 0 |
Neoplasms | 43.3 | 0.078 | 0.007 | 0.07 | 75 | 40 (38) | 0 |
Pharmacology | 43.3 | 0.078 | 0.008 | 0.075 | 57 | 27 (25) | 0 |
Geriatrics | 49 | 0.077 | 0.013 | 0.093 | 11 | 7 (6) | 0 |
Orthopedics | 17.9 | 0.077 | 0.01 | 0.078 | 21 | 10 (8) | 0 |
Psychology | 47.3 | 0.077 | 0.01 | 0.069 | 51 | 9 (8) | 0 |
Gynecology | 51.8 | 0.076 | 0.012 | 0.073 | 16 | 8 (7) | 0 |
Transplantation | 33.8 | 0.074 | 0.031 | 0.046 | 10 | 4 (4) | 0 |
Chemistry Techniques, Analytical | 34.7 | 0.073 | 0.008 | 0.079 | 19 | 10 (7) | 0 |
Toxicology | 42.7 | 0.073 | 0.011 | 0.067 | 24 | 14 (10) | 0 |
Endocrinology | 45.4 | 0.072 | 0.008 | 0.07 | 26 | 15 (11) | 0 |
Pulmonary Medicine | 39.4 | 0.071 | 0.013 | 0.065 | 14 | 8 (5) | 0 |
Pathology | 42.7 | 0.07 | 0.02 | 0.055 | 10 | 3 (3) | 0 |
Hematology | 41.3 | 0.069 | 0.016 | 0.061 | 14 | 7 (6) | 0 |
Rheumatology | 42.9 | 0.069 | 0.017 | 0.081 | 12 | 9 (8) | 0 |
Diagnostic Imaging | 29.3 | 0.068 | 0.026 | 0.055 | 11 | 4 (3) | 0 |
General Surgery | 25.3 | 0.068 | 0.006 | 0.068 | 73 | 38 (32) | 0 |
Gastroenterology | 33.3 | 0.067 | 0.012 | 0.06 | 28 | 17 (13) | 0 |
Neurology | 38 | 0.067 | 0.006 | 0.065 | 108 | 51 (40) | 1 (0) |
Environmental Health | 38.2 | 0.065 | 0.006 | 0.062 | 22 | 10 (8) | 0 |
Genetics | 42.3 | 0.065 | 0.011 | 0.058 | 19 | 10 (8) | 0 |
Biophysics | 28.6 | 0.059 | 0.016 | 0.06 | 10 | 5 (4) | 0 |
Psychiatry | 48.7 | 0.059 | 0.007 | 0.053 | 44 | 11 (9) | 0 |
Biology | 36.8 | 0.058 | 0.011 | 0.047 | 32 | 12 (9) | 0 |
Microbiology | 44.3 | 0.057 | 0.007 | 0.054 | 35 | 10 (6) | 0 |
Biochemistry | 39.3 | 0.055 | 0.007 | 0.06 | 48 | 21 (17) | 0 |
Chemistry | 29.7 | 0.055 | 0.006 | 0.054 | 39 | 18 (14) | 0 |
Molecular Biology | 37.5 | 0.055 | 0.013 | 0.041 | 23 | 7 (5) | 0 |
Dermatology | 47.1 | 0.053 | 0.01 | 0.041 | 24 | 8 (4) | 0 |
Ophthalmology | 33.1 | 0.052 | 0.007 | 0.054 | 35 | 10 (5) | 0 |
Allergy and Immunology | 44 | 0.051 | 0.013 | 0.047 | 13 | 4 (3) | 0 |
Botany | 38.5 | 0.045 | 0.012 | 0.04 | 18 | 4 (2) | 0 |
Brain | 40.7 | 0.045 | 0.015 | 0.052 | 12 | 4 (3) | 0 |
Biomedical Engineering | 25.7 | 0.04 | 0.013 | 0.045 | 12 | 3 (2) | 0 |
Cell Biology | 42.1 | 0.039 | 0.01 | 0.047 | 35 | 11 (8) | 0 |
Immunology | 43.3 | 0.036 | 0.014 | 0.031 | 20 | 6 (4) | 1 (0) |
Substance-Related Disorders | 50.4 | 0.036 | 0.008 | 0.029 | 10 | 1 (0) | 0 |
Veterinary Medicine | 48.4 | 0.034 | 0.01 | 0.038 | 17 | 1 (1) | 0 |
brms
model used to derive the means in Figure 3Note that the 95% credible intervals for the population-level intercept do not overlap zero, which indicates “significant” positive \(\alpha'\) among 2-author papers. Furthermore, \(\alpha'\) is significantly higher for 3-, 4-, and 5-plus-author papers than for 2-author papers. The \(R^2\) of the model is 0.44, and the random effects appear to explain non-trivial amounts of variation.
cat(readLines('data/brms.output.txt'), sep = '\n')
## Family: gaussian
## Links: mu = identity; sigma = identity
## Formula: alpha.adjusted | se(SE) ~ authors + (authors | journal)
## Data: alpha_by_author_number (Number of observations: 1307)
## Samples: 4 chains, each with iter = 6000; warmup = 4000; thin = 1;
## total post-warmup samples = 8000
## ICs: LOO = NA; WAIC = NA; R2 = 0.44
##
## Group-Level Effects:
## ~journal (Number of levels: 844)
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## sd(Intercept) 0.09 0.01 0.08 0.11 2381 1.00
## sd(authors3) 0.08 0.01 0.06 0.10 2084 1.00
## sd(authors4) 0.07 0.01 0.05 0.09 1408 1.00
## sd(authors5P) 0.08 0.01 0.07 0.10 1619 1.00
## cor(Intercept,authors3) -0.84 0.06 -0.94 -0.71 3133 1.00
## cor(Intercept,authors4) -0.86 0.05 -0.94 -0.74 2692 1.00
## cor(authors3,authors4) 0.85 0.08 0.66 0.97 1102 1.00
## cor(Intercept,authors5P) -0.94 0.01 -0.96 -0.91 2008 1.00
## cor(authors3,authors5P) 0.88 0.05 0.76 0.97 713 1.00
## cor(authors4,authors5P) 0.93 0.04 0.82 0.99 290 1.02
##
## Population-Level Effects:
## Estimate Est.Error l-95% CI u-95% CI Eff.Sample Rhat
## Intercept 0.04 0.01 0.02 0.06 3625 1.00
## authors3 0.03 0.01 0.01 0.05 4297 1.00
## authors4 0.02 0.01 0.01 0.04 4218 1.00
## authors5P 0.03 0.01 0.01 0.04 3752 1.00
##
## Samples were drawn using sampling(NUTS). For each parameter, Eff.Sample
## is a crude measure of effective sample size, and Rhat is the potential
## scale reduction factor on split chains (at convergence, Rhat = 1).
Journals with a male-biased author gender ratio showed less author gender homophily, while journals with an even or female-biased author gender ratio showed more homophily. The pattern was similar whether we calculated \(\alpha\) for all authors, first authors only, or last authors only.
gender.ratio <- master.dataset %>%
filter(Age == "New") %>%
ggplot(aes(x = (gender.ratio.of.sample - 50), y = alpha.adjusted)) +
geom_hline(yintercept = 0, linetype = 2) +
geom_vline(xintercept = 0, linetype = 2) +
geom_point(alpha = 0.4, size = 0.7) +
stat_smooth(method = "gam",
formula = y ~ s(x, k = 3),
colour = "tomato", fill = "skyblue") +
xlab("% women authors (relative to gender parity)") +
ylab("Coefficient of homophily (\u03B1')") +
facet_wrap(~Subset) +
theme_minimal() +
theme(legend.position = "none")
ggsave(plot = gender.ratio, filename = "figures/S4 Fig.pdf",
device = cairo_pdf, width=7, height=5)
gender.ratio
S4 Figure: There is a weakly positive, non-linear relationship between the gender ratio of authors publishing in a journal, and the coefficient of homophily (\(\alpha'\)). Specifically, journals with 50% women authors or higher tended to have more same-sex coauthorships than did journals with predominantly men authors. This relationship held whether \(\alpha'\) was calculated for all authors, first authors only, or last authors only. A negative value on the x axis denotes an excess of men authors, a positive value denotes an excess of women authors, and zero denotes gender parity. The lines were fitted using generalised additive models with the smoothing parameter \(k\) set to 3.
Here are the results of three generalised additive models (GAMs) fit to the data in the three panels of Figure 4:
gam(alpha.adjusted ~ s(gender.ratio.of.sample), data = master.dataset %>%
filter(Age == "New", Subset == "All authors") %>%
mutate(gender.ratio.of.sample = gender.ratio.of.sample - 50)) %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## alpha.adjusted ~ s(gender.ratio.of.sample)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.075314 0.001356 55.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(gender.ratio.of.sample) 2.12 2.704 7.367 0.000169 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.00907 Deviance explained = 1.01%
## GCV = 0.0038954 Scale est. = 0.0038897 n = 2116
gam(alpha.adjusted ~ s(gender.ratio.of.sample), data = master.dataset %>%
filter(Age == "New", Subset == "First authors") %>%
mutate(gender.ratio.of.sample = gender.ratio.of.sample - 50)) %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## alpha.adjusted ~ s(gender.ratio.of.sample)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.089800 0.001939 46.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(gender.ratio.of.sample) 2.657 3.381 11.34 8.8e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.0174 Deviance explained = 1.87%
## GCV = 0.0079719 Scale est. = 0.0079581 n = 2116
gam(alpha.adjusted ~ s(gender.ratio.of.sample), data = master.dataset %>%
filter(Age == "New", Subset == "Last authors") %>%
mutate(gender.ratio.of.sample = gender.ratio.of.sample - 50)) %>% summary()
##
## Family: gaussian
## Link function: identity
##
## Formula:
## alpha.adjusted ~ s(gender.ratio.of.sample)
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.084219 0.002131 39.53 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(gender.ratio.of.sample) 4.189 5.226 1.726 0.131
##
## R-sq.(adj) = 0.00337 Deviance explained = 0.534%
## GCV = 0.0096252 Scale est. = 0.0096015 n = 2115
country.histo <- country.data %>%
mutate(Significant = 1 * (p.adjusted < 0.05),
Significant = replace(Significant, Significant == 0, "No"),
Significant = replace(Significant, Significant == 1, "Yes")) %>%
ggplot(aes(x = alpha.adjusted, fill = Significant)) +
geom_vline(xintercept = 0, linetype = 2) +
geom_histogram(binwidth = 0.02, colour = "black") +
scale_fill_manual(values = c("skyblue", "white")) +
xlab("Coefficient of homophily (\u03B1')") +
ylab("Number of journals") +
theme_minimal() +
theme(legend.position = "top")
ggsave(country.histo, file = "figures/S5 Fig.pdf", width = 5, height = 5, device = cairo_pdf)
country.histo
S5 Figure: Histogram of \(\alpha'\) for 325 unique combinations of journal and country, using data from August 2015 - August 2016. The white areas denote combinations for which \(\alpha'\) differs significantly from zero (p < 0.05, following false discovery rate correction).
Using the country data, we can estimate whether the Wahlund effect created by gender differences between countries produced ‘spurious’ positive \(\alpha'\) values in our dataset.
For journal \(i\), we take the average of the differences \(\alpha'_i - \alpha'_{i,j}\), where \(\alpha'_i\) is the coefficient of homophily calculated for all papers from journal \(i\), and \(\alpha'_{i,j}\) is the coefficient of homophily calculated for papers from journal \(i\), by authors from country \(j\). A positive value implies that pooling authors from different countries gives a higher estimate of homophily, consistent with a Wahlund effect.
Plotting a histogram of these average differences for the 285 journals shows that the mean and median difference is essentially zero, illustrating that \(\alpha'\) is not strongly inflated by Wahlund effects caused by differences in author gender ratio between countries.
mean.inflation.due.to.country <- country.data %>%
left_join(master.dataset %>%
filter(Age == "New", Subset == "All authors") %>%
select(journal, alpha.adjusted) %>%
rename(journal.alpha = alpha.adjusted),
by = "journal") %>%
group_by(journal) %>%
summarise(mean.inflation.due.to.country = mean(journal.alpha - alpha.adjusted))
median <- median(mean.inflation.due.to.country$mean.inflation.due.to.country)
txt <- data.frame(x = median + 0.06, y = 66, label = paste("Median =", round(median,3)))
mean.inflation.due.to.country <- mean.inflation.due.to.country %>%
ggplot(aes(mean.inflation.due.to.country)) +
geom_histogram(binwidth = 0.02, colour = "black", fill = "skyblue") +
geom_vline(xintercept = median, linetype = 2) +
geom_text(data=txt, aes(x=x, y=y, label=label)) +
theme_minimal() +
xlab("Estimated inflation in \u03B1' due to\ndifferences in gender ratio between countries") +
ylab("Frequency")
ggsave(mean.inflation.due.to.country, file = "figures/S7 Fig.pdf", width = 5, height = 5, device = cairo_pdf)
mean.inflation.due.to.country
S7 Figure: Histogram showing the estimated degree to which \(\alpha'\) is inflated by inter-country differences in author gender ratio, across the 285 journals for which we had adequate data after restricting the analysis by country. The average inflation in \(\alpha'\) is negligible, suggesting that Wahlund effects resulting from inter-country differences have a neglible effect on our estimates of gender homophily.
In response to a reviewer query, we here calculate the value of \(\alpha\) across all the PubMed journals in our dataset, across the whole \(c.\) 15 year time period. After removing papers with at least one author of unknown gender as well as single-author papers, we were left with 3,709,136 papers in total. \(\alpha\) was 0.126, which is higher than the median for individual journals, consistent with the arguments presented in Figure 1 (i.e. that combining gender data across fields and time periods inflates \(\alpha\) via the Wahlund effect).
author_counts_for_whole_of_Pubmed <-
pubmed_sqlite %>% # Access the big SQLite database
select(gender.95) %>% # Get the column of genders (to 95% certainty, based on genderize.io database)
filter(gender.95 != "U", # Remove papers with author lists like this (cannot search strings on the database side)
gender.95 != "UU",
gender.95 != "UUU",
gender.95 != "UUUU",
gender.95 != "UUUUU",
gender.95 != "F", # no single-author papers either
gender.95 != "M") %>%
collect(n = Inf) %>% # Bring all the remaining data from the SQL database into R
filter(!str_detect(gender.95, "U")) %>% # Remove all papers with any unknown-gender authors
filter(nchar(gender.95) > 1) %>% # Remove single-author papers
mutate(nFemales = str_count(gender.95, "F"), # Count the men and women on each remaining paper
nMales = str_count(gender.95, "M")) %>%
select(-gender.95)
# Number of papers in this dataset: 3,709,136
# Number of authors on these papers: 16,100,616
print(paste("The value of alpha for the whole of Pubmed is:",
round(find.alpha(nMales = author_counts_for_whole_of_Pubmed$nMales,
nFemales = author_counts_for_whole_of_Pubmed$nFemales), 3)))
## [1] "The value of alpha for the whole of Pubmed is: 0.126"
tibble(
`Number of journal-country combinations` = nrow(country.data),
`Median number of papers per journal` = median(country.data$n.useable.papers),
`Median number of authors per journal` = median(country.data$n.authors)
) %>% as.data.frame() %>% pander(split.cell = 40, split.table = Inf)
Number of journal-country combinations | Median number of papers per journal | Median number of authors per journal |
---|---|---|
325 | 70 | 273 |
summary(lme4::lmer(alpha.adjusted ~ (1|journal) + (1|country), weights = 1/SE, data = country.data %>%
mutate(SE = (upper.95.CI2 - alpha.adjusted)/1.96) %>% filter(upper.95.CI2 > alpha.adjusted)))
## Linear mixed model fit by REML ['lmerMod']
## Formula: alpha.adjusted ~ (1 | journal) + (1 | country)
## Data:
## country.data %>% mutate(SE = (upper.95.CI2 - alpha.adjusted)/1.96) %>%
## filter(upper.95.CI2 > alpha.adjusted)
## Weights: 1/SE
##
## REML criterion at convergence: -804.7
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -2.40090 -0.51421 -0.07867 0.47754 2.37229
##
## Random effects:
## Groups Name Variance Std.Dev.
## journal (Intercept) 0.001823 0.04270
## country (Intercept) 0.001207 0.03474
## Residual 0.069918 0.26442
## Number of obs: 325, groups: journal, 285; country, 23
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.08335 0.01033 8.072
paste("% variance associated with country:", round(100 * 0.001069 / (0.001069 + 0.001787 + 0.070926), 2))
## [1] "% variance associated with country: 1.45"
paste("% variance associated with journal:", round(100 * 0.001787 / (0.001069 + 0.001787 + 0.070926), 2))
## [1] "% variance associated with journal: 2.42"
x <- nrow(country.data[country.data$p.adjusted < 0.05, ])
fig_s6 <- country.data %>%
filter(p.adjusted < 0.05) %>%
filter(discipline != "Multidisciplinary") %>%
arrange(lower.95.CI2) %>%
mutate(y = paste(country, journal, sep = " - "),
y = (factor(y, y))) %>%
ggplot(aes(alpha.adjusted, y, colour = y)) +
geom_errorbarh(aes(xmin = lower.95.CI2, xmax = upper.95.CI2), height = 0) +
geom_point() +
xlab("Coefficient of homophily (\u03B1') \u00B1 95% CIs") +
ylab("Country - journal combination") +
theme_minimal() +
theme(legend.position = "none")
ggsave(fig_s6, file = "figures/S6 Fig.pdf", width = 7, height = 9, device = cairo_pdf)
fig_s6
S6 Figure: Plot showing the 68 combinations of journal and author country of affiliation for which \(\alpha'\) is significantly higher than expected.
This section makes Figure 5, by deriving the expected \(\alpha\) under various assumptions, assuming that there is (potentially) a gender gap between career stages but no direct choice of collaborators by gender. The model has four parameters:
GR.stage1
in the code below (this parameter is plotted on the x-axis of Figure 5)GR.stage2
in the code below (this parameter is plotted on the y-axis of Figure 5)prop_between_career_stages
in the code below (this parameter is plotted in the columns of Figure 5)prop_within_stage_between_two_stage1s
in the code below (this parameter is plotted in the rows of Figure 5)Note that we do not need to define a parameter for the proportion of within-career stage co-authorships that are between two people at career stage 2; this is already known, since it is defined as 1 - prop_within_stage_between_two_stage1s
.
Given these 4 parameters, we can easily calculate the proportion of the three possible types of collaborations with respect to career stage: stage1 with stage1, stage1 with stage2, and stage2 with stage2. See step 1 in the code.
Next, we need to calculate the proportion of collaborator pairs that are male-male, female-female, and male-female. Since we have defined the frequencies of each collaboration type with respect to career stage, as well as the gender ratios at each career stage, we can calculate the frequency of each gender combination. See step 2 in the code.
Finally, we calculate \(\alpha = p - q\), where \(p\) and \(q\) are the proportions of men and women, respectively, whose coauthor is male (step 3 in the code).
run_career_stage_model <- function(params){
# Attach the vectors of parameters from the dataframe input
# GR = gender ratio, so GR.stage1 means gender ratio among career stage 1 ("early-career", e.g. PhD students)
# and GR.stage1 means gender ratio among career stage 2 ("established", e.g. professors)
GR.stage1 <- params$GR.stage1
GR.stage2 <- params$GR.stage2
# Gere are the proportions of collaborations of each of the three types:
# between career stages, between two stage 1 researchers, and between two stage 2 researchers
prop_between_career_stages <- params$prop_between_career_stages
prop_within_career_stages <- 1 - prop_between_career_stages
prop_within_stage_between_two_stage1s <- params$prop_within_stage_between_two_stage1s
prop_within_stage_between_two_stage2s <- 1 - prop_within_stage_between_two_stage1s
# Step 1. Calculate the proportions of each type of collaboration:
# early-early, early-established, established-established respectively
p1.1 <- prop_within_career_stages * prop_within_stage_between_two_stage1s
p1.2 <- prop_between_career_stages
p2.2 <- prop_within_career_stages * prop_within_stage_between_two_stage2s
# Step 2
# Define helper functions to calculate the proportion of MM, FF, and MF collaborations.
# Given the gender ratio at career stages a and b, termed GR_a, GR_b, find the
# proportion of collaborations that are man-man, woman-woman, or mixed gender
man.man <- function(GR_a, GR_b) (1 - GR_a) * (1 - GR_b)
woman.woman <- function(GR_a, GR_b) GR_a * GR_b
mixed <- function(GR_a, GR_b) (1 - GR_a) * GR_b + (1 - GR_b) * GR_a
# Find the frequencies of the 3 kinds of co-author pairs w.r.t. gender, i.e. MM, FF, and MF
man.man <- p1.1 * man.man(GR.stage1, GR.stage1) +
p1.2 * man.man(GR.stage1, GR.stage2) +
p2.2 * man.man(GR.stage2, GR.stage2)
woman.woman <- p1.1 * woman.woman(GR.stage1, GR.stage1) +
p1.2 * woman.woman(GR.stage1, GR.stage2) +
p2.2 * woman.woman(GR.stage2, GR.stage2)
mixed <- p1.1 * mixed(GR.stage1, GR.stage1) +
p1.2 * mixed(GR.stage1, GR.stage2) +
p2.2 * mixed(GR.stage2, GR.stage2)
# Step 3: Find p and q, which are used to calculate alpha, as p - q:
# p is proportion of men coauthoring with men
p <- 1 - (mixed / (2 * man.man + mixed))
# q is proportion of women coauthoring with men
q <- mixed / (2 * woman.woman + mixed)
# Return the calculated value of alpha, as well as the four parameters used to calculate it
data.frame(alpha = p - q,
GR.stage1,
GR.stage2,
prop_between_career_stages,
prop_within_stage_between_two_stage1s)
}
# Now run the model on a large parameter space
results <- expand.grid( # Define the parameter space for the model
GR.stage1 = seq(0.01, 0.7, length = 201),
GR.stage2 = seq(0.01, 0.7, length = 201),
prop_between_career_stages = c(0.01, 0.25, 0.5, 0.75, .99),
prop_within_stage_between_two_stage1s = c(0.01, 0.25, 0.5, 0.75, .99)) %>%
filter(GR.stage1 >= GR.stage2) %>% # We assume the % women is equal or lower in career 1 than in career stage 2
run_career_stage_model() %>% # run the model
mutate(GR.stage1 = round(GR.stage1 * 100), # Convert proportions to percentages
GR.stage2 = round(GR.stage2 * 100),
prop_between_career_stages = prop_between_career_stages * 100,
prop_within_stage_between_two_stage1s = prop_within_stage_between_two_stage1s * 100) %>%
filter(!duplicated(paste(GR.stage1, GR.stage2, prop_between_career_stages, prop_within_stage_between_two_stage1s)))
# Plot the results to make figure 5
# NB that I edited the top- and right-side labels in Inkscape afterwards, as it was tricky to do it entirely in R.
model_figure <- results %>%
ggplot(aes(GR.stage1, GR.stage2, z = round(alpha,2), fill = alpha)) +
facet_grid(prop_between_career_stages ~ prop_within_stage_between_two_stage1s) +
geom_tile() +
geom_abline(colour = "grey20", size = 0.3) +
stat_contour(breaks = c(-5:-1, 1:5)/10, colour = "grey20", size = 0.3) +
scale_fill_distiller(name = "\u03B1",
palette = "RdYlBu",
limits = c(-1,1) * max(abs(results$alpha))) +
scale_x_continuous(expand = c(0,0), limits = c(0,70))+
scale_y_continuous(expand = c(0,0), limits = c(0,70))+
xlab("% women among early-career authors") +
ylab("% women among established authors") +
theme_bw() +
theme(panel.grid.minor = element_blank(),
strip.background = element_blank(),
legend.position = "bottom") +
ggtitle("Rows show % between-stage collaborations ",
subtitle = "Columns show % within-stage collaborations that involve two early-career researchers")
model_figure
ggsave(model_figure, file = "figures/Fig5.pdf",
height = 5.6, width = 5, device = cairo_pdf)
Figure 5: When the gender ratio of early-career researchers is not equal to the gender ratio among established researchers, the null expectation for \(\alpha\) is not necessarily zero. Specifically, if most collaborations occur between career stages, there will be an excess of mixed-gender collaborations (\(\alpha < 0\), blue areas), while if most collaborator pairs comprise two people at the same career stage, there will be an excess of same-gender collaborations (\(\alpha > 0\), red areas). However, the conditions required for strong gender homophily (i.e. the red areas) are quite restrictive, making it unlikely that this issue can fully explain the homophily observed in our study. Additionally, in research disciplines where between-career stage collaboration is common and there is a shortage of women among established researchers (i.e. the blue areas), our study will underestimate the strength of gender homophily. Contour lines denote increments of 0.1.
sessionInfo() %>% pander()
R version 3.5.1 (2018-07-02)
Platform: x86_64-apple-darwin15.6.0 (64-bit)
locale: en_AU.UTF-8||en_AU.UTF-8||en_AU.UTF-8||C||en_AU.UTF-8||en_AU.UTF-8
attached base packages: parallel, stats, graphics, grDevices, utils, datasets, methods and base
other attached packages: knitr(v.1.20), Cairo(v.1.5-9), brms(v.2.8.0), Rcpp(v.1.0.0), ggbeeswarm(v.0.6.0), mgcv(v.1.8-24), nlme(v.3.1-137), gridExtra(v.2.3), tidyr(v.0.8.2), pander(v.0.6.2), lmerTest(v.3.0-1), lme4(v.1.1-17), Matrix(v.1.2-14), ggplot2(v.3.1.0), stringr(v.1.3.1) and dplyr(v.0.8.0.1)
loaded via a namespace (and not attached): matrixStats(v.0.54.0), xts(v.0.11-0), bit64(v.0.9-7), RColorBrewer(v.1.1-2), threejs(v.0.3.1), rprojroot(v.1.3-2), rstan(v.2.18.2), numDeriv(v.2016.8-1), tools(v.3.5.1), backports(v.1.1.2), R6(v.2.4.0), DT(v.0.4), vipor(v.0.4.5), DBI(v.1.0.0), lazyeval(v.0.2.1), colorspace(v.1.3-2), withr(v.2.1.2), prettyunits(v.1.0.2), tidyselect(v.0.2.5), processx(v.3.2.1), Brobdingnag(v.1.2-5), bit(v.1.1-14), compiler(v.3.5.1), cli(v.1.0.1), shinyjs(v.1.0), labeling(v.0.3), colourpicker(v.1.0), scales(v.1.0.0), dygraphs(v.1.1.1.6), mvtnorm(v.1.0-8), ggridges(v.0.5.0), callr(v.2.0.4), StanHeaders(v.2.18.0), digest(v.0.6.18), minqa(v.1.2.4), rmarkdown(v.1.10), base64enc(v.0.1-3), pkgconfig(v.2.0.2), htmltools(v.0.3.6), dbplyr(v.1.2.2), htmlwidgets(v.1.2), rlang(v.0.3.1), RSQLite(v.2.1.1), shiny(v.1.2.0), zoo(v.1.8-3), crosstalk(v.1.0.0), gtools(v.3.8.1), inline(v.0.3.15), magrittr(v.1.5), loo(v.2.1.0), bayesplot(v.1.5.0), munsell(v.0.5.0), abind(v.1.4-5), stringi(v.1.3.1), yaml(v.2.2.0), MASS(v.7.3-50), pkgbuild(v.1.0.2), plyr(v.1.8.4), blob(v.1.1.1), grid(v.3.5.1), promises(v.1.0.1), crayon(v.1.3.4), miniUI(v.0.1.1.1), lattice(v.0.20-35), splines(v.3.5.1), ps(v.1.3.0), pillar(v.1.3.1.9000), igraph(v.1.2.1), markdown(v.0.8), shinystan(v.2.5.0), reshape2(v.1.4.3), stats4(v.3.5.1), rstantools(v.1.5.0), glue(v.1.3.0.9000), evaluate(v.0.11), nloptr(v.1.0.4), httpuv(v.1.4.5), gtable(v.0.2.0), purrr(v.0.3.1), assertthat(v.0.2.0), mime(v.0.6), xtable(v.1.8-3), coda(v.0.19-2), later(v.0.7.5), rsconnect(v.0.8.8), tibble(v.2.0.99.9000), shinythemes(v.1.1.1), memoise(v.1.1.0), beeswarm(v.0.2.3) and bridgesampling(v.0.4-0)