2) ProPerMSA_preparation.Rmd

# ProPer preparation (II): Praat-to-R

Collect data from Praat objects into an R dataframe.

```{r clean_start, warning=FALSE}
# Start from an empty workspace. The TextGrid steps further down decide what
# to do via exists("textGridSyll_df") / exists("textGridWord_df"), so objects
# left over from an earlier run would otherwise be picked up by mistake.
rm(list = ls())

## Load required libraries
# library() stops with an error when a package is missing. require() would
# only return FALSE with a warning (hidden here by warning=FALSE), and the
# failure would surface later as a "could not find function" error.
library(rPraat)
library(stringr)
library(dplyr)
```

# Harvest acoustic data

```{r pre_prepare_raw_df, warning=FALSE}
##########--- Read and collect Praat data into R tables ---########## 

##### Intensity
# Read every IntensityTier file and stack the samples into one long table
# with the columns: file (name without extension), t (integer, time * 1000)
# and intensity (rounded to 4 decimals).
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched literally and anchored to the end of the file name.
files_intensity <- list.files(
  path = "praat_data/intensity_tiers/",
  pattern = "\\.IntensityTier$",
  full.names = TRUE
)
intensity_df <- plyr::ldply(files_intensity, function(f) {
  # Capture the last path component up to its first dot
  filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
  file <- filename[, 2]
  intensitier <- it.read(f)
  # Round to 3 decimals, then scale by 1000 (seconds -> ms, assuming the
  # tier stores seconds). The as.character() detour below presumably guards
  # against values like 28.999999999999996 being truncated to 28 by
  # as.integer(); keep it identical across tables so the join keys agree.
  time <- round(intensitier$t, 3) * 1000
  data.frame(
    file,
    t = as.integer(as.character(time)),
    intensity = round(intensitier$i, 4)
  )
})

##### Full-time
# Build the complete time grid of every recording from the tmin/tmax of its
# IntensityTier, and attach the metadata encoded in the file name.
fullTime_df <- plyr::ldply(files_intensity, function(f) {
  # Whole base name (text before the first dot of the last path component)
  base_match <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
  # Three underscore-separated fields: <speaker>_<focus>_<trial>.<ext>;
  # names that do not fit this shape yield NA for all three fields
  name_parts <- str_match(f, ".*/([^/._]*)_([^/._]*)_([^/._]*)\\.[^/]*$")
  tier <- it.read(f)
  # One grid point every 0.001 between tmin and tmax, scaled by 1000
  grid <- seq(tier[["tmin"]], tier[["tmax"]], 0.001) * 1000
  data.frame(
    file = base_match[, 2],
    t = as.integer(as.character(grid)),
    speaker = name_parts[, 2],
    focus = name_parts[, 3],
    trial = name_parts[, 4]
  )
})

##### F0: Pitch Tier
# (get the smooth F0 curve)
# Read every PitchTier file and stack its points into one long table with
# the columns: file, t (integer, time * 1000) and f0_smooth (2 decimals).
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched literally and anchored to the end of the file name.
files_pitchTier <- list.files(
  path = "praat_data/pitch_tiers/",
  pattern = "\\.PitchTier$",
  full.names = TRUE
)
f0_smooth_df <- plyr::ldply(files_pitchTier, function(f) {
  # Capture the last path component up to its first dot
  filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
  file <- filename[, 2]
  pt <- pt.read(f)
  # Same time-key construction as the other tables, so that joins on
  # (file, t) line up
  time <- round(pt[["t"]], 3) * 1000
  f0 <- pt[["f"]]
  data.frame(
    file,
    t = as.integer(as.character(time)),
    f0_smooth = round(f0, 2)
  )
})

##### Pitch object
# Get the strength (i.e. the periodic fraction from Praat's autocorrelation)
# and the frequency of selected pitch candidates:
#   rowmax = highest strength value within the frequency range
#            (up to 'pitch_ceiling');
#   row1   = Praat's path finder choice for F0 candidates.
# (Note: this may take longer to process!!!)
# `pattern` is a regular expression, not a shell glob: the unanchored
# "*.Pitch" would also accept names such as "x.PitchTier".
files_pitchObject <- list.files(
  path = "praat_data/pitch_objects/",
  pattern = "\\.Pitch$",
  full.names = TRUE
)
pitchObject_df <- plyr::ldply(files_pitchObject, function(f) {
  # Capture the last path component up to its first dot
  filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
  file <- filename[, 2]
  pitch_object <- pitch.read(f)
  time <- round(pitch_object$t, 3) * 1000
  pitch_ceiling <- 1000 # fixed to periods up to 1000Hz

  # Convert the Pitch object once and reuse both arrays (this conversion was
  # previously repeated four times per file).
  pitch_arrays <- pitch.toArray(pitch_object)
  strength_raw <- pitch_arrays[["strengthArray"]]
  freq_raw <- pitch_arrays[["frequencyArray"]]

  # First candidate of every column; columns are paired with `time` below,
  # so there is one value per time point
  strength_row1 <- apply(strength_raw, 2, function(x) x[1])
  f0_row1 <- apply(freq_raw, 2, function(x) x[1])
  f0_row1[which(f0_row1 == 0)] <- NA

  # Zeros become NA so they are ignored by the column-wise maximum
  strengthArray <- apply(
    as.data.frame(strength_raw), 2,
    function(x) ifelse(x == 0, NA, x)
  )
  freqArray <- apply(
    as.data.frame(freq_raw), 2,
    function(x) ifelse(x == 0, NA, x)
  )

  # 1 where the candidate frequency is within the ceiling, 0 above it
  zero_one_freqs <- apply(
    freqArray, 2,
    function(x) ifelse(x > pitch_ceiling, 0, 1)
  )

  # Zero out the strength of candidates above the ceiling. Plain elementwise
  # multiplication gives the same values as the former per-element mapply().
  # NOTE(review): `[, -1]` leaves the first column (the first time point)
  # unmasked -- confirm this is intended rather than `[-1, ]`.
  strength_limited <- strengthArray
  strength_limited[, -1] <- strengthArray[, -1] * zero_one_freqs[, -1]

  # Columns with no usable candidate give -Inf from max(); report them as 0
  strength_rowmax <- apply(strength_limited, 2, max, na.rm = TRUE)
  strength_rowmax[is.infinite(strength_rowmax)] <- 0

  data.frame(
    file,
    t = as.integer(as.character(time)),
    strength_row1 = round(strength_row1, 7),
    f0_row1 = round(f0_row1, 2),
    strength_rowmax = round(strength_rowmax, 7)
  )
})
```

# Read TextGrids: 'syllables' tier

```{r pre_prepare_TextGrids_Syllable, warning=FALSE}

##### TextGrid
# TextGrids are optional! They are useful for exposition and to improve the
# automatic detection. By default, TextGrids are expected with at least one
# interval tier, named "syllables", demarcating syllabic boundaries.
#
# `pattern` is a regular expression, not a shell glob, so the extension is
# matched literally and anchored to the end of the file name.
files_textGrid <- list.files(
  path = "praat_data/textgrids/",
  pattern = "\\.TextGrid$",
  full.names = TRUE
)

# Take syllabic intervals and labels from the "syllables" tier.
# textGridSyll_df is only created when at least one TextGrid provides that
# tier; later steps test for it with exists("textGridSyll_df").
if (length(files_textGrid) > 0) {
  syll_rows <- plyr::ldply(files_textGrid, function(f) {
    # Capture the last path component up to its first dot
    filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
    file <- filename[, 2]
    tg <- tg.read(f, encoding = "auto")

    # Look the tier up by its exact name ($ would partially match other tier
    # names). A file without the tier is skipped instead of aborting the
    # whole step with a "differing number of rows" error.
    if (is.null(tg[["syllables"]])) {
      message("No 'syllables' tier in ", f, "; file skipped")
      return(NULL)
    }
    syll_tier <- data.frame(tg[["syllables"]])

    # Unlabelled intervals get NA times; labelled ones use the same
    # time-key construction as the acoustic tables
    t1 <- ifelse(syll_tier$label == "", NA, round(syll_tier$t1, 3) * 1000)
    t2 <- ifelse(syll_tier$label == "", NA, round(syll_tier$t2, 3) * 1000)
    t_mid <- round((t1 + t2) / 2)
    syll_label <- syll_tier$label
    data.frame(
      file,
      t = as.integer(as.character(t1)),
      syll_start = as.integer(as.character(t1)),
      syll_mid = as.integer(as.character(t_mid)),
      syll_end = as.integer(as.character(t2)),
      syll_bounds = as.integer(as.character(t1)),
      syll_label
    )
  })

  if (nrow(syll_rows) > 0) {
    textGridSyll_df <- syll_rows %>%
      group_by(file) %>%
      mutate(
        # An unlabelled interval that directly follows a labelled one marks
        # that syllable's right edge: reuse the previous end as a boundary
        syll_bounds = ifelse(
          is.na(syll_bounds) & !is.na(lag(syll_end, 1)),
          lag(syll_end, 1),
          syll_bounds
        ),
        t = syll_bounds
      ) %>%
      # Rows without a boundary time cannot be joined on (file, t)
      filter(!is.na(t)) %>%
      # Grouping was only needed for the per-file lag()
      ungroup()
  }
}

```

# Read TextGrids: 'words' tier

```{r pre_prepare_TextGrids_Word, warning=FALSE}

# Take word intervals and labels from the "words" tier (for exposition
# purposes). textGridWord_df is only created when at least one TextGrid
# provides that tier; later steps test for it with exists("textGridWord_df").
if (length(files_textGrid) > 0) {
  word_rows <- plyr::ldply(files_textGrid, function(f) {
    # Capture the last path component up to its first dot
    filename <- str_match(f, ".*/([^/.]*)\\.[^/]*$")
    file <- filename[, 2]
    tg <- tg.read(f, encoding = "auto")

    # Look the tier up by its exact name ($ would partially match other tier
    # names). A file without the tier is skipped instead of aborting the
    # whole step with a "differing number of rows" error.
    if (is.null(tg[["words"]])) {
      message("No 'words' tier in ", f, "; file skipped")
      return(NULL)
    }
    word_tier <- data.frame(tg[["words"]])

    # Unlabelled intervals get NA times; labelled ones use the same
    # time-key construction as the acoustic tables
    word_t1 <- ifelse(word_tier$label == "", NA, round(word_tier$t1, 3) * 1000)
    word_t2 <- ifelse(word_tier$label == "", NA, round(word_tier$t2, 3) * 1000)
    word_t_mid <- round((word_t1 + word_t2) / 2)
    word_label <- word_tier$label
    data.frame(
      file,
      t = as.integer(as.character(word_t1)),
      word_start = as.integer(as.character(word_t1)),
      word_mid = as.integer(as.character(word_t_mid)),
      word_end = as.integer(as.character(word_t2)),
      word_bounds = as.integer(as.character(word_t1)),
      word_label
    )
  })

  if (nrow(word_rows) > 0) {
    textGridWord_df <- word_rows %>%
      group_by(file) %>%
      mutate(
        # An unlabelled interval that directly follows a labelled one marks
        # that word's right edge: reuse the previous end as a boundary
        word_bounds = ifelse(
          is.na(word_bounds) & !is.na(lag(word_end, 1)),
          lag(word_end, 1),
          word_bounds
        ),
        t = word_bounds
      ) %>%
      # Rows without a boundary time cannot be joined on (file, t)
      filter(!is.na(t)) %>%
      # Grouping was only needed for the per-file lag()
      ungroup()
  }
}
  
```

# Combine data into raw_df

```{r prepare_raw_df, warning=FALSE}

##### Combine all data
# The full time grid is the backbone; every measurement table is attached to
# it by (file, t), so grid points without a measurement end up as NA.
raw_df <- fullTime_df %>%
  left_join(f0_smooth_df, by = c("file", "t")) %>%
  left_join(intensity_df, by = c("file", "t")) %>%
  left_join(pitchObject_df, by = c("file", "t"))

# TextGrid tables only exist when TextGrid files were found
if (exists("textGridSyll_df")) {
  raw_df <- raw_df %>% left_join(textGridSyll_df, by = c("file", "t"))
}
if (exists("textGridWord_df")) {
  raw_df <- raw_df %>% left_join(textGridWord_df, by = c("file", "t"))
}

##### Calculate and add the total-power and the periodic-power vectors
# total_power converts the intensity column from a log (x/10 exponent) scale
# to a linear one; 4e-10 is presumably the squared 2e-5 reference -- confirm.
# periodic_power is the share of that power given by strength_rowmax.
raw_df <- raw_df %>%
  group_by(file) %>%
  mutate(
    total_power = round(4e-10 * 10^(intensity / 10), 9),
    periodic_power = round(total_power * strength_rowmax, 9)
  )

```

# Write raw_df table

```{r write_raw_df, warning=FALSE}
##### Write the raw data
# write.csv() does not create missing folders; make sure the output folder
# exists so the run does not fail at the very last step.
dir.create("data_tables", showWarnings = FALSE, recursive = TRUE)
write.csv(raw_df, "data_tables/raw_df.csv", row.names = FALSE)
```