from SensevalParser import *
from SensevalStructs import *
from math import sqrt
import cPickle
import sys

"""
Dispy:
    This class does word sense disambiguation on a set of ambiguous words, using 
    training and test data from SENSEVAL-3.
    
Parameters:
    language: a string, the language (and therefore training and test files) 
    	to use.
    winSize: an int, the maximum context window size
"""
class Dispy:
    def __init__(self, language, winSize = 500):
        self.cfiers = ('cos', 'bayes', 'dlist')
        self.confDicts, self.weightDicts, self.limits = {}, {}, {}
        self.wDict, self.results = {}, {}
        self.filePath = "/local/seed/katz/senseval/%s/" % language
        self.language = language
        self.bagFlag = False
        self.alphaDict = {'basque': .00003, 'catalan': .0005, 'spanish': \
        				  .00005, 'italian': .0005, 'romanian': .00001}
        self.alpha = self.alphaDict[self.language]
        self.winSize = winSize
        self.tempFile = open( "%stemp.tmp" % self.filePath, "w" )
        self.pChooser = SensevalParser()
        
    def __del__( self ):
        self.tempFile.close()
    
    """
    main:
       This function is the main core of Dispy.  It either generates or loads 
       the necessary data structures (confDict and wDict), and then 
       disambiguates a test set.  It then calls printResults.
   
    Parameters:
        CVFlag: a boolean, True means do cross-validation, False means load 
        	confDict from file
        WDFlag: a boolean, True means do training, False means load wDict 
        	from file
        (if CVFlag is true, WDFlag is automatically set to True)
    """
    def main(self, CVFlag = True, WDFlag = True):
        if CVFlag:
            self.selectParser('train')
            self.crossValidate(10)
            del self.pTrain
            print "Finished Cross Validation"
            
            self.selectParser('train')
            self.createWordDict()
            del self.pTrain
            print "Finished Creating Word Dict"
        else:
            self.loadConfDicts()
            print "Finished Loading Conf Dicts"
            if WDFlag:
                self.selectParser('train')
                self.createWordDict()
                del self.pTrain
                print "Finished Creating Word Dict"
            else:
                self.loadWordDict()
                print "Finished Loading Word Dict"
                
        self.selectParser('test')
        self.teSet, aWord = self.pTest.getNextSet()
        while self.teSet != None:
            self.limits[aWord] = {'bayes': {'min': None, 'max': None}, \
                                  'cos': {'min': None, 'max': None}, \
                                  'dlist': {'min': None, 'max': None}}
            self.results[aWord] = self.disambiguate(aWord)
            self.setLimits(self.results[aWord])
            self.getCombined(aWord)
            del self.teSet
            print "Finished Disambiguation: %s" % aWord
            self.teSet, aWord = self.pTest.getNextSet()
        del self.pTest
        self.printResults()
    
    """
    crossValidate:
       This function divides a trainSet into ten sections, then cross 
       validates on each section.  It stores the max and min scores from 
       each classifier, and then loads its results into confDicts.
   
    Parameters:
        crossNum: an int, the number of times that it cross-validates.
    """    
    def crossValidate(self, crossNum):
        resultList = []
        trainSet, aWord = self.pTrain.getNextSet()
        while trainSet != None:
            self.weightDicts[aWord], self.confDicts[aWord] = {}, {}
            self.limits[aWord] = {}
            for cfier in self.cfiers:
                self.limits[aWord][cfier] = {'min': None, 'max': None}
                self.weightDicts[aWord][cfier] = Weight()
                self.confDicts[aWord][cfier] = ConfDict()
            fraction = 1.0/float(crossNum)
            trainSections = []
            
            for i in range(crossNum):
                (lowBound, hiBound) = (fraction * float(i), fraction * float(i+1))
                trainSections.append(trainSet[int(lowBound * len(trainSet)):\
                                                int(hiBound * len(trainSet))])
            del trainSet
            
            for i in range(crossNum):
                self.trSet = []
                tempSet = [trainSections[j] for j in range(crossNum) if j != i]
                for set in tempSet:
                    self.trSet.extend(set)
                self.train(aWord)
                print "Finished Training: %s:\tCV # %d" % (aWord, i)
                self.teSet = trainSections[i]
                results = self.disambiguate(aWord)
                resultList.extend(results)
                self.wDict = {}
                del results
                print "Finished Disambiguating: %s:\tCV # %d" % (aWord, i)
            trainSet, aWord = self.pTrain.getNextSet()
            
        self.weightDicts['total'] = {'bayes': Weight(), 'cos': Weight(), \
                                     'dlist': Weight()}
        self.setLimits(resultList) 
        
        #Cross Validation finished, compute confDicts
        for result in resultList:
            for cfier in self.cfiers:
                tDict = result.guessDicts[cfier]
                bestSense = tDict.keys()[0]
                hiScore = tDict[bestSense]
                for sense in tDict.iterkeys():
                    if tDict[sense] > hiScore:
                        hiScore = tDict[sense]
                        bestSense = sense
                        
                isCorr = False
                for answer in result.answers:
                    if bestSense == answer:
                        isCorr = True
                        
                self.confDicts[result.word][cfier].addConf(self.normalize(\
                                            result.word, cfier, hiScore), isCorr)
                self.weightDicts[result.word][cfier].addWord(isCorr)
                self.weightDicts['total'][cfier].addWord(isCorr)
                
        file = open("%sCVWords.data" % self.filePath, "w")
        for aWord in self.wDict.iterkeys():
            file.write("**** %s ****\n" % aWord)
            for cfier in self.cfiers:
                self.weightDicts[aWord][cfier].calcProb()
                self.confDicts[aWord][cfier].countToProb()
                file.write("%s\t %.2f%%\n" % (cfier, 100. * \
                                  self.weightDicts[aWord][cfier].prob))
        file.close()
        for cfier in self.cfiers:
            self.weightDicts['total'][cfier].calcProb()
        file = open("%scDict.pickle" % self.filePath, "w")
        cPickle.dump(self.confDicts, file)
        file.close()
        file = open("%sweightDicts.pickle" % self.filePath, "w")
        cPickle.dump(self.weightDicts, file)
        file.close()
        self.limits = {}
            
    """
    train:
        This function trains on self.trSet.  It assumes that all instances in 
        self.trSet have the string aWord as the ambiguous word.  It creates the 
        representative cosine vector, the decision list, and the bayesian 
        feature distribution.
        
    Parameters:
        aWord: a string, the ambiguous word
    """     
    def train(self, aWord):
        for inst in self.trSet:
            (tWord, tsSenses) = (inst.head.word, inst.head.tags['senses'])
            for tSense in tsSenses:
                if not self.wDict.has_key(tWord):
                    self.wDict[tWord] = Word(self.alpha, tWord)
                flag = True
                for sense in self.wDict[tWord].sDict.itervalues():
                    if sense.label == tSense:
                        flag = False
                if flag:
                    self.wDict[tWord].sDict[tSense]= Sense(tSense) 
                self.wDict[tWord].sDict[tSense].incOccurances()
                for i in range(len(inst.context)):
                    if inst.context[i].tags.has_key('head'):
                        self.wDict[tWord].sDict[tSense].addWords(\
                                                         self.genFeatures(i, \
                                             inst.context, lambda x : x.word, "W"))
                        self.wDict[tWord].addToContext(self.genFeatures(i, \
                        											inst.context, \
                                                           lambda x : x.word, "W"))
                    else:
                        self.wDict[tWord].sDict[tSense].addWords(\
                                                            [inst.context[i].word])
                        self.wDict[tWord].addToContext([inst.context[i].word])
        self.calcGlobals(aWord)
        self.makeRuleList(aWord)
        self.wDict[aWord].contextDictToList()
        self.wDict[aWord].createWordVectors()

    """
    disambiguate:
       This function disambiguates the instances in self.teSet.  It fills a 
       list of Results with the guesses from each classifier.
       
    Parameters:
       aWord: a string, the ambiguous word
       
    Returns:
       results: a list of Results, containing the guesses of each classifier
    """
    def disambiguate(self, aWord):
        results = [Result('none') for x in range(len(self.teSet))]
        for j in range(len(self.teSet)):
            tWord = self.teSet[j].head.word, 
            tSense = self.teSet[j].head.tags['senses']
            tList = self.teSet[j].context
            results[j].setAnswers(tSense, tWord)
            if tWord == aWord:
                #Cosine
                testVector = self.createTestVector(self.teSet[j])
                #Naive Bayes
                for sense in self.wDict[tWord].sDict.iterkeys():
                    self.wDict[tWord].sDict[sense].resetScore()
                    for i in range(len(tList)):
                        if tList[i].tags.has_key('head'):
                            feats = self.genFeatures(i, tList, \
                            						 lambda x : x.word, "W")
                        else:
                            feats = [tList[i].word]
                        self.wDict[tWord].sDict[sense].incScore(feats)
                #Decision List
                featCount = {}
                for i in range(len(tList)):
                    if tList[i].word == aWord:
                        for feat in self.genFeatures(i, tList, \
                                                     lambda x : x.word, "W"):
                            featCount[feat] = True
                    else:
                        featCount[tList[i].word] = True
                for sense in self.wDict[tWord].sDict.itervalues():
                    #Decision List
                    tAns = self.wDict[tWord].getDLGuess(sense.label, featCount)
                    results[j].addGuess('dlist', tAns.sense, tAns.weight)
                    #Cosine
                    cosine = self.getCosine(testVector, \
                                   self.wDict[tWord].sDict[sense.label].wordVector)
                    results[j].addGuess('cos', sense.label, cosine)
                    #Naive Bayes
                    results[j].addGuess('bayes', sense.label, sense.score)
            else:     
                print "|%s|  |%s|" % (tWord, aWord)
                print "***************aWord ERROR***********************"
        return results
                
    """
    printResults:
        This function displays the results stored in self.results.
    """
    def printResults(self):
        [total, comb_total, oracle_total] = [0.0 for i in range(3)]
        resultsDict = {}
        for cfier in self.cfiers: 
            resultsDict[cfier] = 0.0
        for aWord in self.results.iterkeys():
            [tTotal, tComb_total, tOracle_total] = [0.0 for i in range(3)]
            tResultsDict = {}
            for cfier in self.cfiers: 
                tResultsDict[cfier] = 0.0
            for result in self.results[aWord]:
                oracle = False
                for cfier in self.cfiers:
                    guess = self.getBestGuess(result.word, cfier, result[cfier])
                    isCorr = False
                    for answer in result.answers:
                        if guess == answer:
                               resultsDict[cfier] += 1.0
                               tResultsDict[cfier] += 1.0
                               oracle = True
                               
                comb_guess = result.guessDicts['combined'].keys()[0]
                for answer in result.answers:
                    if comb_guess == answer:
                        comb_total += 1.0  
                        tComb_total += 1.0 
                if oracle:
                    oracle_total += 1.0 
                    tOracle_total += 1.0      
                    oracle == False
                total += 1.0
                tTotal += 1.0
                
            print "%s Disambiguated: %d" % (aWord, tTotal)
            print "Cosine: %.2f%%" % (100.0 * tResultsDict['cos'] / tTotal)
            print "D-List: %.2f%%" % (100.0 * tResultsDict['dlist'] / tTotal)
            print "Bayes : %.2f%%" % (100.0 * tResultsDict['bayes'] / tTotal)
            print "Ptery : %.2f%%" % (100.0 * tComb_total / tTotal)
            print "Oracle: %.2f%%\n" % (100.0 * tOracle_total / tTotal)
            
        print "Total Words Disambiguated: %d" % (total)
        print "Cosine: %.2f%%" % (100.0 * resultsDict['cos'] / total)
        print "D-List: %.2f%%" % (100.0 * resultsDict['dlist'] / total)
        print "Bayes : %.2f%%" % (100.0 * resultsDict['bayes'] / total)
        print "Ptery : %.2f%%" % (100.0 * comb_total / total)
        print "Oracle: %.2f%%" % (100.0 * oracle_total / total)
        
    """
    selectParser:
        This function sets self.pTest or self.pTrain to the correct parser
        
    Parameters:
        arg: a string, either 'test' or 'train'
    """
    def selectParser(self, arg):
        if arg == 'test':
            self.pTest = self.pChooser.chooseParser(self.language, arg, \
                                                    self.winSize)
        if arg == 'train':
            self.pTrain = self.pChooser.chooseParser(self.language, arg, \
                                                     self.winSize)
        
    """
    createWordDict:
       This function is basically a wrapper for train.  It loops through each 
       lexical element, training on each one and filling the wDict
    """
    def createWordDict(self):
        google = False
        if google == True:
            g = GoogleParser()
            q = g.readFile(\
            '/home/msingle1/dispy/src/google/google-searches/xml/Spanish/', 'foo' )
            self.trSet += q
            del g
            del q
        self.trSet, aWord = self.pTrain.getNextSet()
        while self.trSet != None:
            self.train(aWord)
            del self.trSet
            print "Finished Training: %s" % aWord
            self.trSet, aWord = self.pTrain.getNextSet()
        file = open("%swDict.pickle" % self.filePath, "w")
        cPickle.dump(self.wDict, file)
        file.close()
        print "Finished Training"
    
    """
    loadConfDicts:
    	loads the confidence Dictionary from the file in the self.filepath folder
    """
    def loadConfDicts(self):
        file = open("%scDict.pickle" % self.filePath, "r")
        self.confDicts = cPickle.load(file)
        file.close()
        file = open("%sweightDicts.pickle" % self.filePath, "r")
        self.weightDicts = cPickle.load(file)
        file.close()
        
    """
    loadWordDict:
    	loads the Word Dictionary from the file in the self.filepath folder
    """
    def loadWordDict(self):
        file = open("%swDict.pickle" % self.filePath, "r")
        self.wDict = cPickle.load(file)
        file.close()
        
    """
    setLimits:
        This function runs through a list of Results, finding the max and min score 
        for each classifier, and saves those into self.limits
        
    Parameters:
        resultList: a list of Results, filled with guesses and scores
    """
    def setLimits(self, resultList):
        for cfier in self.cfiers:
            tConf = resultList[0][cfier].values()[0]
            for result in resultList:
                self.limits[result.word][cfier]['min'] = tConf
                self.limits[result.word][cfier]['max'] = tConf
                maxConf = minConf = result[cfier].values()[0]
                for conf in result[cfier].itervalues():
                    if conf != None:
                        maxConf = max(maxConf, conf)
                        minConf = min(minConf, conf)
                self.limits[result.word][cfier]['max'] = \
                            max(self.limits[result.word][cfier]['max'], maxConf)
                self.limits[result.word][cfier]['min'] = \
                            min(self.limits[result.word][cfier]['min'], minConf)
    
    """
    calcGlobals:
        This function calculates the frequencies of each sense of a given word
        
    Parameters:
        word: a string, the ambiguous word
    """
    def calcGlobals(self, word):
        total = 0.0
        for sense in self.wDict[word].sDict.itervalues():
            total += sense.occurrences
        for sense in self.wDict[word].sDict.iterkeys():
            self.wDict[word].sDict[sense].setProb(\
				            self.wDict[word].sDict[sense].occurrences / total)
				            
    """
    gram:
        This is a helper function for genFeatures
    
    Parameters: 
        key: a string, a label for the feature
        lst: a list of strings, the words
        c: the index of the ambiguous word
        s: an int, relative address of left end of n-gram
        e: an int, relative address of right end of n-gram
        entries: a list of ints, used to create non-continuous n-grams
    """
    def gram( self, key, lst, c, s=None, e=None, entries=None ):
        if s == None:
            s = c
        if e == None:
            e = s   
        if entries is not None:
            strEntries = map( lambda x : str( x ), entries )
            label = "__%s[%s]__" % ( key, ",".join( strEntries ) )
        elif s == e:
            label = "__%s[%d]__" % ( key, s )
        else:
            label = "__%s[%d:%d]__" % ( key, s, e+1 )
        if entries is not None:
            try:
                rlst = [lst[i+c] for i in entries]
            except:
                rlst = []
            rv = label + "__".join( rlst )
        else:
            rv = label + "__".join( lst[c+s:c+e+1] )
        return rv

    """
    genFeatures
        This function generates a list of features around a given word
    
    Parameters: 
        index: an int, the index of the ambiguous word
        lst:   a list of entries, the context of the word
        fn:    a function, determining which field of the entry to use
        label: a string, a label for the feature
    """
    def genFeatures( self, index, lst, fn, label ):
        ( wordFlag, biFlag, triFlag, weight ) = ( True, True, True, 10 )
        maxNgram = 3
        sidx = max( index - maxNgram, 0 )
        eidx = min( index + maxNgram, len( lst )-1 )
        words = [''] * ( 2 * maxNgram + 1 )
        tgt = maxNgram
        for i in range( sidx, eidx+1 ):
            words[i-index+maxNgram] = fn( lst[i] )
        tags = []
        tags.append( self.gram( label, words, tgt, 0 ) )
        tags.append( self.gram( label, words, tgt, -1 ) )
        tags.append( self.gram( label, words, tgt, +1 ) )
        if biFlag:
            tags.append( self.gram( label, words, tgt, -1, 0 ) )
            tags.append( self.gram( label, words, tgt, -2, -1 ) )
            tags.append( self.gram( label, words, tgt, 0, +1 ) )
            tags.append( self.gram( label, words, tgt, +1, +2 ) )
            tags.append( self.gram( label, words, tgt, entries=[-1, +1] ) )
        if triFlag:
            tags.append( self.gram( label, words, tgt, -2, -0 ) )
            tags.append( self.gram( label, words, tgt, -3, -1 ) )
            tags.append( self.gram( label, words, tgt, 0, +2 ) )      
            tags.append( self.gram( label, words, tgt, +1, +3 ) )
            tags.append( self.gram( label, words, tgt, -1, +1 ) )
            
        if weight != -1:
            tlst = [x+1 for x in range( weight )]
            for i in tlst:
                if 0 <= index-i:
                    for j in range( weight+1-i ):
                        tags.append( lst[index-i].word )
                if index+i < len( lst ):
                    for j in range( weight+1-i ):
                        tags.append( lst[index+i].word )
        return tags
                 
    """
    normalize:
        This function adjusts a classifier score so that it is between 0 and 1.  
        self.limits must be set.
        
    Parameters:
        aWord: a string, the ambiguous word
        cfier: the classifier in question
        score: the score to be adjusted
    """
    def normalize(self, aWord, cfier, score):
        conf = (score - self.limits[aWord][cfier]['min']) / \
                (self.limits[aWord][cfier]['max'] - \
                  self.limits[aWord][cfier]['min'])
        return conf
    
    """
    convertConf:
        This function converts classifier scores between 0 and 1 to confidences, 
        based on the self.confDicts "buckets"
    
    Parameters:
        aWord: a string, the ambiguous word
        cfier: the classifier in question
        conf:  the score to be converted to a confidence
    """
    def convertConf(self, aWord, cfier, conf):
        score = min(max(0, self.normalize(aWord, cfier, conf)), .99)
        index = int(10*score)
        return self.confDicts[aWord][cfier][index]
    
    """
    createTestVector:
        This function takes an instance and creates a feature vector for cosine 
        comparison
        
    Parameters:
        instance: an Instance of an ambiguous word
        
    Returns:
        teVector: a feature vector
    """
    def createTestVector(self, instance):
        cDict = {}
        teVector = []
        for entry in instance.context:
            if cDict.has_key(entry.word):
                cDict[entry.word] += 1.0
            else:
                cDict[entry.word] = 1.0
        tWord = instance.head.word
        for i in range(len(self.wDict[tWord].cList)):
            if cDict.has_key(self.wDict[tWord].cList[i]):
                value = float(cDict[self.wDict[tWord].cList[i]]) * \
                        self.wDict[tWord].idfVector[i]
            else:
                value = 0.0
            teVector.append(value)  
        return teVector
    
    """
    getCosine:
        This function takes two vectors and computes the cosine between them
    
    Parameters:
        v1: a vector of floats
        v2: a vector of floats
        
    Returns:
        cos: a float, the cosine between the vectors
    """
    def getCosine(self, v1, v2):
        if len(v1) != len(v2):
            print "**** VECTOR ERROR: NOT THE SAME DIMENSIONS: %d , %d ****" % \
                              (len(v1), len(v2))
        dotProd = 0.0
        (mathLen1, mathLen2) = (0.0, 0.0)
        for i in range(len(v1)):
            dotProd += float(v1[i]) * float(v2[i])
            mathLen1 += float(v1[i]) ** 2
            mathLen2 += float(v2[i]) ** 2
        mathLen1 = sqrt(mathLen1)
        mathLen2 = sqrt(mathLen2)
        bottom = mathLen1 * mathLen2
        if bottom == 0.0:
            print "**** VECTOR ERROR:ZERO COSINE ****"
            return 0.0
        return dotProd / bottom
    
    """
    makeRuleList:
        This function converts the ruleDict in a Word struct to a list
        
    Parameters:
        word: a string, the ambiguous word
    """
    def makeRuleList(self, word):
        for sense in self.wDict[word].sDict.iterkeys():
            self.wDict[word].sDict[sense].setAlpha(self.alpha)
            for feat in self.wDict[word].sDict[sense].featCount.iterkeys():
                self.wDict[word].incFeature(feat, \
                                   self.wDict[word].sDict[sense].featCount[feat])
        for sense in self.wDict[word].sDict.iterkeys():
            for feat in self.wDict[word].sDict[sense].featCount.iterkeys():
                self.wDict[word].addRule(feat, sense, \
                                   self.wDict[word].sDict[sense].featCount[feat])
        self.wDict[word].ruleDictToList()
  
    """
    getCombined:
        This function fills self.results with the combiner guesses for each 
        instance
        
    Parameters:
        aWord: a string, the ambiguous word
    """
    def getCombined(self, aWord):
        for result in self.results[aWord]:
            (tWord, tsSenses) = (result.word, result.answers)
            comb_guess = self.combined(aWord, result)
            result.addGuess('combined', comb_guess, 0.0)
        
    """
    combined:
        This function calculates the combiner guess by summing weighted confidences 
        from each classifier
    
    Parameters:
        aWord: a string, the ambiguous word
        result: the Result that the combiner is guessing about
    
    Returns:
        retVal: a string, the combiner guess
    """
    def combined(self, aWord, result):
        guessCounts = {}
        maxScore = 0.0
        retVal = "NONE"
        weights = {}
        for cfier in self.cfiers:
            weights[cfier] = self.weightDicts[aWord][cfier].prob * \
                             self.weightDicts['total'][cfier].prob
        for cfier in self.cfiers:
            sDict = {}
            for sense in result.guessDicts[cfier].iterkeys():
                score = weights[cfier] * max(0, self.convertConf(aWord, cfier, \
                                                  result.guessDicts[cfier][sense]))
                sDict[sense] = [score, sense, result.guessDicts[cfier][sense]]
            senseList = sDict.values()[:]
            senseList.sort(lambda x, y: cmp(x[2], y[2]))
            for i in range(len(senseList)):
                senseList[i][0] += i * .00000001
                if guessCounts.has_key(senseList[i][1]):
                    guessCounts[senseList[i][1]] += senseList[i][0]
                else:
                    guessCounts[senseList[i][1]] = senseList[i][0]
        for sense in guessCounts.iterkeys():
            if guessCounts[sense] > maxScore:
                maxScore = guessCounts[sense]
                retVal = sense
        if retVal == "NONE":
            return "************ Combiner Error **************"
        return retVal
        
    """
    getBestGuess:
        This function determines the guess with the highest score for a given 
        classifier
    
    Parameters:
        word: a string, the ambiguous word
        cfier: the classifier in question
        dict: a dictionary, keys are senses, values are scores
    
    Returns:
        retVal: the classifier's guess
    """
    def getBestGuess(self, word, cfier, dict):
        retVal = "none"
        maxVal = 0.0
        for sense in dict.iterkeys():
            conf = self.normalize(word, cfier, dict[sense])
            if conf > maxVal:
                maxVal = conf
                retVal = sense
        return retVal
    
if __name__ == "__main__":
    if len(sys.argv) == 2:
        lang = sys.argv[1]
    else:
        lang = 'catalan'
    print "***** %s *****" % lang
    d = Dispy(lang)
    d.main(CVFlag = True, WDFlag = True)
    del d

