##########################
#### UN- training set ####
##########################

# Purpose: build a training set and an application set for the prefix "un-".
# Every word in the semantic space that starts with "un" and whose remainder
# (the base) is itself a word in the space yields one row:
#   un-  <base>  un<base>
#
# NOTE(review): the original script started with rm(list = ls()). It was
# dropped because wiping the caller's workspace is a destructive side effect;
# every object used below is (re)assigned by this script anyway.

material_dir <- "G:/Lehre/Spring School Bolzano 2021/Material"

# Expected to provide the object `baroni`, whose row names are the vocabulary.
load(file.path(material_dir, "baroni.rda"))

words <- rownames(baroni)
stopifnot(is.character(words), length(words) > 0)

## all words starting with un-
un <- words[grepl("^un", words)]

## strip the leading "un"
# Anchored sub() removes only the prefix. The previous unanchored gsub() removed
# every "un" in the word (e.g. "untuned" -> "ted"), which produced wrong bases
# and derived forms that are not in the vocabulary.
candidates <- sub("^un", "", un)

# A bare "un" would leave an empty base; discard it.
candidates <- candidates[nzchar(candidates)]

## which bases are still in the semantic space?
found <- intersect(candidates, words)
found
# note: this still contains trash (e.g. words such as "unit" or "union", where
# "un" is not the negative prefix), but we just accept that
# what can we do to make this better?

## create the training set
# rep() keeps the affix column the same length as `found`, so an empty result
# gives an empty data frame instead of an error about differing row counts.
trainset <- data.frame(
  affix = rep("un-", length(found)),
  base = found,
  derived = paste0("un", found)
)

## export the training set
write.table(
  trainset,
  file = file.path(material_dir, "UN_trainset.txt"),
  quote = FALSE, row.names = FALSE, col.names = FALSE
)

## the application set
# Same rows as the training set, with "__cmp" appended to the derived form
# (third column).
applset <- trainset
applset[, 3] <- paste0(applset[, 3], "__cmp")

write.table(
  applset,
  file = file.path(material_dir, "UN_applset.txt"),
  quote = FALSE, row.names = FALSE, col.names = FALSE
)