library(tidyverse)
library(lme4)

####load data

#enter your working directory here and save the data file to it
data_dir <- "G:\\Shared drives\\CHAGL\\CHAGL\\13 R analysis"
data_file <- "chagl_public_data.csv"

#fail early with an actionable message when the directory above has not been
#adapted to the current machine
if (!dir.exists(data_dir)) {
  stop("Data directory not found: ", data_dir,
       ". Edit 'data_dir' so it points at the folder that contains ",
       data_file, ".", call. = FALSE)
}

#the working directory is still changed, so relative paths resolve as before
setwd(data_dir)

if (!file.exists(data_file)) {
  stop("Data file '", data_file, "' not found in ", data_dir, ".",
       call. = FALSE)
}

fulldf <- read.csv(data_file)

#explanation of variables
#subject: a unique code denoting the subject
#group: four levels, PAC, RAC, PSC, RSC (Progressive Affix Control, Regressive Affix Control, Progressive Stem Control, etc.)
#trial_num: trial number
#correct: a binary variable indicating if the subject answered correctly for that trial. 1 indicates correct, 0 indicates incorrect.
#correct_answer: the correct answer in string form
#first: the nonword that appeared first in the 2AFC pair for that trial
#second: the nonword that appeared second in the 2AFC pair for that trial
#selected_item: the item the subject selected that trial
#nits: number of intervening syllables between target and trigger; ranges 0 - 3
#direction: progressive or regressive
#locus: stem control or affix control
#language: the subject's native language (English or Spanish)
#item: a numeric variable coding the lexical item for that trial. Both /s/ and /sh/ variants of an item are included under a single item number.
#biphone.prob: the biphone probability of the correct item, according to Vitevitch & Luce (2004)
#phoneme.prob: the phonemic probability of the correct item, according to Vitevitch & Luce (2004)

#subset the data by native language and standardize trial number and nits
#scale() centers each variable on its mean and divides by its standard deviation;
#this is done separately within each language subset
#scale() returns a one-column matrix carrying "scaled:*" attributes, so the result
#is converted back to a plain numeric vector to keep ordinary data-frame columns

english <- subset(fulldf, language == "English") %>%
  mutate(trial.c = as.numeric(scale(trial_num)),
         nits.c = as.numeric(scale(nits)))

spanish <- subset(fulldf, language == "Spanish") %>%
  mutate(trial.c = as.numeric(scale(trial_num)),
         nits.c = as.numeric(scale(nits)))

#########################################
#Models
###########################################

#model for Study 1 (English subset):
#binomial (logistic) mixed-effects regression fit with glmer
#response: 'correct'
#fixed effects: nits.c, trial.c, group and biphone.prob,
#  plus the nits.c x trial.c and group x trial.c interactions
#random effects: by-subject intercept with slopes for trial.c and nits.c,
#  and a by-item intercept
#optimizer: bobyqa with maxfun raised to 2e5

engmod <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    (1 + trial.c + nits.c|subject) + (1|item),
  data = english,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(engmod)

#store the model's fitted values as a column of the data frame
english[["y_mod_glmer"]] <- fitted(engmod)

################################
#Model for Study 2 (Spanish subset)

########
#binomial (logistic) mixed-effects regression fit with glmer
#response: 'correct'
#fixed effects: nits.c, trial.c, group, biphone.prob and phoneme.prob,
#  plus the nits.c x trial.c and group x trial.c interactions
#random effects: by-subject intercept with a slope for trial.c,
#  and a by-item intercept
#optimizer: bobyqa with maxfun raised to 2e5

spanmod <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    phoneme.prob + (1 + trial.c|subject) + (1|item),
  data = spanish,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(spanmod)

#store the model's fitted values as a column of the data frame
spanish[["y_mod_glmer"]] <- fitted(spanmod)

#################################################
#Figures
#################################################

#Figure 1: plot by learner success, English learners

#calculate proportion correct by subject
#.groups = "drop" returns an ungrouped result so later steps do not silently
#inherit the grouping

engsum <- english %>%
  group_by(subject, group) %>%
  summarise(correct = mean(correct, na.rm = TRUE), .groups = "drop")

#add a variable for success (proportion correct of at least 0.6)
#and an ID that combines group and success

engsum <- engsum %>%
  mutate(success = ifelse(correct >= 0.6, "Successful Learner", "Non-learner"),
         ID = paste(group, success, sep = " "))

#count subjects per success x group cell and transfer counts to dataframe 'engsum'

engsmallsum <- engsum %>%
  group_by(success, group) %>%
  summarise(count = n_distinct(subject), .groups = "drop") %>%
  mutate(ID = paste(group, success, sep = " "))

engsum$count <- engsmallsum$count[match(engsum$ID, engsmallsum$ID)]

#plot Figure 1
#NOTE: the count labels and their positions below are hard-coded;
#check them against 'engsmallsum' whenever the data change

fig1 <- ggplot(engsum)+
  aes(group, correct, fill=factor(success))+
  geom_boxplot(position = position_dodge()) +  
  theme_classic() +
  scale_fill_brewer(palette = "Accent")  +
  ggtitle("Proportion Correct by Group and Learner Success") +
  #coord_cartesian(ylim=c(0, 1)) +
  theme(text = element_text(size=14)) +
  annotate("text", x = .8, y = .475, label = "13") +
  annotate("text", x = 1.2, y = .83, label = "13") +
  annotate("text", x = 1.8, y = .505, label = "13") +
  annotate("text", x = 2.2, y = .9, label = "13") +
  annotate("text", x = 2.8, y = .49, label = "15") +
  annotate("text", x = 3.2, y = .755, label = "10") +
  annotate("text", x = 3.8, y = .52, label = "16") +
  annotate("text", x = 4.2, y = .78, label = "9") +
  # geom_text(aes(label = count), vjust = 7, color = "black") +
  labs(x=NULL, y = "Proportion Correct", fill = NULL) +
  #overlay one point per subject; its color legend is suppressed below
  geom_jitter(aes(color = success), size = 2, shape = 2, width = .1) +
  guides(color = "none") +
  scale_color_manual(values = c("#497449", "#72687f"))
fig1

#Figure 2: English learners' performance over time
#plots loess-smoothed model fitted values (y_mod_glmer) against trial number,
#colored by direction and with line type by locus
#NOTE: the y axis is labelled "Proportion Correct" but shows the fitted values

fig2 <- english %>%
  group_by(trial_num, group, language, locus, direction, subject) %>%
  #na.rm belongs inside mean(); outside it, summarise() creates a column named 'na.rm'
  summarise(correct = mean(correct, na.rm = TRUE), 
            fitted = mean(y_mod_glmer, na.rm = TRUE),
            .groups = "drop") %>%
  ggplot()+
  aes(x = trial_num, y = fitted, color = direction, linetype = locus)+
  #labels are matched to levels by position - TODO confirm the level order of
  #'direction' and 'locus' in the data
  scale_color_manual(values=c("#af7ac5", "#48c9b0"), labels=c("Progressive", "Regressive"))  +
  scale_linetype(labels=c("Affix-Controlled", "Stem-Controlled"))+
  theme_classic() +
  #  geom_point(alpha = 0.4)+
  coord_cartesian(ylim = c(0.4, 1))+
  #hand-placed group labels to the right of the curves; positions are hard-coded
  annotate("text", x = 102, y = .715, label = "PAC", color = "gray20") +
  annotate("text", x = 102, y = .74, label = "PSC", color = "gray20") +
  annotate("text", x = 102, y = .675, label = "RAC", color = "gray20") +
  annotate("text", x = 102, y = .63, label = "RSC", color = "gray20") +
  labs(x="Trial Number", y = "Proportion Correct", color = "Direction", linetype = "Locus") +
  #line width is set with 'linewidth' ('size' is deprecated for lines in ggplot2 >= 3.4.0)
  geom_smooth(method = "loess", linewidth = 1.5, fill = "gray40")+
  #legend keys for linetype are drawn in gray with a thinner line;
  #linewidth has to sit inside override.aes to take effect
  guides(linetype = guide_legend(override.aes = list(color = "gray40", linewidth = 0.7)))
fig2


#Figure 3: proportion correct by group and number of intervening transparent
#syllables (nits), English learners

#per-cell standard deviation, standard error and mean of 'correct'
#'correct' is overwritten last so the sd/se lines still see the trial-level values
engsum2 <- summarise(
  group_by(english, group, nits, direction, locus, language),
  correct.sd = sd(correct),
  correct.se = correct.sd / sqrt(n()),
  correct = mean(correct)
)

#dodged bars with +/- 1 standard error bars
fig3 <- ggplot(engsum2, aes(x = group, y = correct, fill = factor(nits))) +
  geom_col(position = position_dodge()) +
  geom_errorbar(
    aes(ymin = correct - correct.se, ymax = correct + correct.se),
    position = position_dodge(0.9),
    width = 0.2,
    color = "gray40"
  ) +
  coord_cartesian(ylim = c(0, 1)) +
  scale_fill_brewer(palette = "Accent") +
  labs(
    title = "Proportion Correct by Group and Number of Intervening \nTransparent Syllables",
    x = NULL,
    y = "Proportion Correct",
    fill = "Number of \nIntervening \nTransparent \nSyllables"
  ) +
  theme_classic() +
  theme(text = element_text(size = 14))
fig3

#Figure 4: plot of learner success, Spanish learners

#calculate proportion correct by subject
#.groups = "drop" returns an ungrouped result so later steps do not silently
#inherit the grouping

spansum <- spanish %>%
  group_by(subject, group) %>%
  summarise(correct = mean(correct, na.rm = TRUE), .groups = "drop")

#add a variable for success (proportion correct of at least 0.6)
#and an ID that combines group and success

spansum <- spansum %>%
  mutate(success = ifelse(correct >= 0.6, "Successful Learner", "Non-learner"),
         ID = paste(group, success, sep = " "))

#mean proportion correct and number of subjects per success x group cell

spansmallsum <- spansum %>%
  group_by(success, group) %>%
  summarise(correct = mean(correct, na.rm = TRUE), 
            count = n_distinct(subject),
            .groups = "drop")

#create figure
#NOTE: the count labels and their positions below are hard-coded;
#check them against 'spansmallsum' whenever the data change

fig4 <- ggplot(spansum)+
  aes(group, correct, fill=factor(success))+
  geom_boxplot(position = position_dodge()) +  
  theme_classic() +
  scale_fill_brewer(palette = "Accent")  +
  ggtitle("Proportion Correct by Group and Learner Success") +
  #coord_cartesian(ylim=c(0, 1)) +
  theme(text = element_text(size=14)) +
  annotate("text", x = .8, y = .5, label = "12") +
  annotate("text", x = 1.2, y = .93, label = "13") +
  annotate("text", x = 1.8, y = .505, label = "18") +
  annotate("text", x = 2.2, y = .905, label = "8") +
  annotate("text", x = 2.8, y = .46, label = "10") +
  annotate("text", x = 3.2, y = .85, label = "15") +
  annotate("text", x = 3.8, y = .52, label = "12") +
  annotate("text", x = 4.2, y = .875, label = "13") +
  # geom_text(aes(label = count), vjust = 7, color = "black") +
  labs(x=NULL, y = "Proportion Correct", fill = NULL) +
  #overlay one point per subject; its color legend is suppressed below
  geom_jitter(aes(color = success), size = 2, shape = 2, width = .1) +
  guides(color = "none") +
  scale_color_manual(values = c("#497449", "#72687f"))
fig4

#Figure 5: Spanish learners' performance over time
#plots loess-smoothed model fitted values (y_mod_glmer) against trial number,
#colored by direction and with line type by locus
#NOTE: the y axis is labelled "Proportion Correct" but shows the fitted values
fig5 <- spanish %>%
  group_by(trial_num, group, language, locus, direction, subject) %>%
  #na.rm belongs inside mean(); outside it, summarise() creates a column named 'na.rm'
  summarise(correct = mean(correct, na.rm = TRUE), 
            fitted = mean(y_mod_glmer, na.rm = TRUE),
            .groups = "drop") %>%
  ggplot()+
  aes(x = trial_num, y = fitted, color = direction, linetype = locus)+
  #labels are matched to levels by position - TODO confirm the level order of
  #'direction' and 'locus' in the data
  scale_color_manual(values=c("#af7ac5", "#48c9b0"), labels=c("Progressive", "Regressive"))  +
  scale_linetype(labels=c("Affix-Controlled", "Stem-Controlled"))+
  theme_classic() +
  #  geom_point(alpha = 0.4)+
  coord_cartesian(ylim = c(0.4, 1))+
  #hand-placed group labels to the right of the curves; positions are hard-coded
  annotate("text", x = 102, y = .76, label = "RAC", color = "gray20") +
  annotate("text", x = 102, y = .705, label = "PAC", color = "gray20") +
  annotate("text", x = 102, y = .675, label = "RSC", color = "gray20") +
  annotate("text", x = 102, y = .63, label = "PSC", color = "gray20") +
  labs(x="Trial Number", y = "Proportion Correct", color = "Direction", linetype = "Locus") +
  #line width is set with 'linewidth' ('size' is deprecated for lines in ggplot2 >= 3.4.0)
  geom_smooth(method = "loess", linewidth = 1.5, fill = "gray40")+
  #legend keys for linetype are drawn in gray with a thinner line;
  #linewidth has to sit inside override.aes to take effect
  guides(linetype = guide_legend(override.aes = list(color = "gray40", linewidth = 0.7)))
fig5
#Figure 6: proportion correct by group and number of intervening transparent
#syllables (nits), Spanish learners

#per-cell standard deviation, standard error and mean of 'correct'
#'correct' is overwritten last so the sd/se lines still see the trial-level values
spansum2 <- summarise(
  group_by(spanish, group, nits, direction, locus, language),
  correct.sd = sd(correct),
  correct.se = correct.sd / sqrt(n()),
  correct = mean(correct)
)

#dodged bars with +/- 1 standard error bars
fig6 <- ggplot(spansum2, aes(x = group, y = correct, fill = factor(nits))) +
  geom_col(position = position_dodge()) +
  geom_errorbar(
    aes(ymin = correct - correct.se, ymax = correct + correct.se),
    position = position_dodge(0.9),
    width = 0.2,
    color = "gray40"
  ) +
  coord_cartesian(ylim = c(0, 1)) +
  scale_fill_brewer(palette = "Accent") +
  labs(
    title = "Proportion Correct by Group and Number of Intervening \nTransparent Syllables",
    x = NULL,
    y = "Proportion Correct",
    fill = "Number of \nIntervening \nTransparent \nSyllables"
  ) +
  theme_classic() +
  theme(text = element_text(size = 14))
fig6

###########################################
#rerun models with different reference levels of group
#sub-analysis
###########################################

#English data
#refit the Study 1 model three times, each time with a different level of
#'group' as the reference level
#note: every relevel() call overwrites english$group in place (and turns it
#into a factor), so the level order of 'group' differs after this section

#PSC as reference level

english$group <- relevel(factor(english$group), ref = "PSC")

engmod_psc <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    (1 + trial.c + nits.c|subject) + (1|item),
  data = english,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(engmod_psc)

#RSC as reference level

english$group <- relevel(factor(english$group), ref = "RSC")

engmod_rsc <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    (1 + trial.c + nits.c|subject) + (1|item),
  data = english,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(engmod_rsc)

#RAC as reference level

english$group <- relevel(factor(english$group), ref = "RAC")

engmod_rac <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    (1 + trial.c + nits.c|subject) + (1|item),
  data = english,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(engmod_rac)

#Spanish data
#refit the Study 2 model three times, each time with a different level of
#'group' as the reference level
#note: every relevel() call overwrites spanish$group in place (and turns it
#into a factor), so the level order of 'group' differs after this section

#PSC as reference level

spanish$group <- relevel(factor(spanish$group), ref = "PSC")

spanmod_psc <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    phoneme.prob + (1 + trial.c|subject) + (1|item),
  data = spanish,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(spanmod_psc)

#RSC as reference level

spanish$group <- relevel(factor(spanish$group), ref = "RSC")

spanmod_rsc <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    phoneme.prob + (1 + trial.c|subject) + (1|item),
  data = spanish,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(spanmod_rsc)

#RAC as reference level

spanish$group <- relevel(factor(spanish$group), ref = "RAC")

spanmod_rac <- glmer(
  formula = correct ~ nits.c*trial.c + group * trial.c + biphone.prob +
    phoneme.prob + (1 + trial.c|subject) + (1|item),
  data = spanish,
  family = "binomial",
  control = glmerControl(
    optimizer = "bobyqa",
    optCtrl = list(maxfun = 2e5)
  )
)
summary(spanmod_rac)