##########################
#### UN- training set ####
##########################

## Load the semantic space; the .rda file must provide an object `baroni`.
## The global environment is deliberately NOT cleared with rm(list = ls()):
## that would silently delete whatever else the user has in their session.
## Restart R if a clean workspace is needed.
loaded <- load("G:/Lehre/Spring School Bolzano 2021/Material/baroni.rda")

## load() returns the names of the objects it restored; fail early and
## clearly if the expected object is not among them.
if (!"baroni" %in% loaded) {
  stop("baroni.rda does not contain an object named 'baroni'", call. = FALSE)
}

## the vocabulary: row names of the loaded object
words <- rownames(baroni)

## all words starting with un-
un <- grep("^un", words, value = TRUE)

## strip the "un" prefix
## The pattern is anchored and sub() replaces only the first match, so only
## the leading "un" is removed. An unanchored gsub() would delete every "un"
## in the word (e.g. "unfounded" -> "foded") and create bogus candidates.
candidates <- sub("^un", "", un)

## which words are still in the semantic space?
found <- intersect(candidates, words)
found
# note: this can still contain trash, because not every word-initial "un" is
#      the prefix (pairs such as unit/it or union/ion slip through whenever
#      the remainder happens to be a word in the space); we just accept that.
#      what can we do to make this better?

## create the training set
## One row per word in `found`: the affix label, the base word, and the
## base word with "un" prepended.
## Guard against an empty `found`: paste0("un", character(0)) yields "un",
## which would otherwise surface as an obscure "differing number of rows"
## error from data.frame().
if (length(found) == 0L) {
  stop("no un- words with a base form in the semantic space", call. = FALSE)
}
trainset <- data.frame(
  affix = "un-",
  base = found,
  derived = paste0("un", found),
  stringsAsFactors = FALSE # keep the columns as plain character vectors
)

## export the training set
## No quotes, no row names, no header: column names are not written.
write.table(
  trainset,
  file = "G:/Lehre/Spring School Bolzano 2021/Material/UN_trainset.txt",
  quote = FALSE, row.names = FALSE, col.names = FALSE
)

## the application set
## Same rows as the training set, with the suffix "__cmp" appended to every
## entry of the third column.
applset <- trainset
applset[[3]] <- paste0(applset[[3]], "__cmp")

## export the application set (no quotes, no row names, no header)
write.table(
  applset,
  file = "G:/Lehre/Spring School Bolzano 2021/Material/UN_applset.txt",
  quote = FALSE, row.names = FALSE, col.names = FALSE
)