from Buffer import Buffer
from SensevalStructs import Entry, Instance
import os, re, codecs
import xml.sax

class SensevalParser:
    """Factory that maps a language name onto the matching parser class."""

    def chooseParser(self, language, arg, win):
        """Build the parser for `language`.

        Parameters:
            language: 'basque', 'catalan', 'spanish', 'italian' or 'romanian'
            arg:      passed through as the parser's first constructor argument
            win:      passed through as the parser's second constructor argument

        Returns:
            a new parser instance, or None (after printing an error banner)
            when `language` is not one of the names listed above
        """
        if language == 'basque':
            return BasqueParser(arg, win)
        if language == 'catalan':
            return CatalanParser(arg, win)
        if language == 'spanish':
            return SpanishParser(arg, win)
        if language == 'italian':
            return ItalianParser(arg, win)
        if language == 'romanian':
            return RomanianParser(arg, win)
        # Parenthesised so the statement is valid under Python 2 and Python 3.
        print("************************PARSER ERROR***************************")
        # Callers must be prepared for None when the language is unknown.
        return None

"""
LexeltHandler:
    Subclasses xml.sax.ContentHandler.  This is an "event-driven" parser (read
    about it, it's cool).  It should be fed one lexelt at a time, starting at
    the opening lexelt tag and ending with the closing lexelt tag.  It parses
    the XML chunk and fills self.corpus with a list of Instance objects.

Parameters:
    corpus:   an empty list that will be filled with Instance objects
    winSize:  the desired window size
    testFlag: true if test set, false if train set
    ansDict:  dict mapping an instance id to its sense; only read when testFlag is true
"""
class LexeltHandler(xml.sax.ContentHandler):
    """SAX handler that converts the XML of one lexelt into Instance objects.

    Every <w> element becomes an Entry pushed onto a Buffer created with
    winSize; when an <instance> element closes, the buffered entries are
    wrapped in an Instance and appended to `corpus`.

    Parameters:
        corpus:   list that receives the Instance objects
        winSize:  the desired window size
        testFlag: true if test set, false if train set
        ansDict:  dict indexed by instance id; only read when testFlag is true
    """

    def __init__(self, corpus, winSize, testFlag, ansDict):
        xml.sax.ContentHandler.__init__(self)
        self.winSize = winSize
        self.tCorpus = Buffer(self.winSize)
        self.corpus = corpus
        self.senses = []
        self.testFlag = testFlag
        self.headFlag = False    # True once the head word of the instance was seen
        self.headCount = 0       # <w> elements counted from the head word onwards
        self.lemFlag = False     # True while the current <w> is the head word
        self.doneFlag = False    # True once no more words are buffered for the instance
        self.instanceRE = re.compile(r'^<instance.* id="([^"]*)')
        self.answerDict = ansDict
        self.ID = "D-BAG"

    def startElement(self, tag, attrs):
        """Track the senses/ID of the instance and buffer one Entry per <w>."""
        if self.testFlag:
            if tag == "instance":
                # In test mode the sense is looked up in answerDict by instance id.
                self.senses.append(self.answerDict[attrs.getValueByQName('id')])
                self.ID = attrs.getValueByQName('id')
        elif tag == 'answer':
            self.senses.append(attrs.getValueByQName('senseid'))
            self.ID = attrs.getValueByQName('instance')
        if self.doneFlag or tag != 'w':
            return
        tags = {}
        word = None
        for attrn in attrs.getQNames():
            if attrn == 'head':
                tags['head'] = True
                tags['senses'] = self.senses
                self.headFlag = True
                self.lemFlag = True
            elif attrn == 'frm':
                # The surface form becomes the Entry word, not a tag.
                word = attrs.getValueByQName('frm')
            else:
                tags[attrn] = attrs.getValueByQName(attrn)
        if self.headFlag:
            # From the head word on, stop buffering once the count of <w>
            # elements exceeds half of the window size.
            if self.headCount > self.winSize/2.0:
                self.doneFlag = True
                return
            self.headCount += 1
        if self.lemFlag:
            # The head word is stored under its lemma and tagged with the ID.
            self.lemFlag = False
            tags["ID"] = self.ID
            self.tCorpus.push(Entry(tags['lem'], tags))
        else:
            if word is None:
                # A <w> without 'frm' used to raise UnboundLocalError; fall
                # back to the lemma (None when that is missing as well).
                word = tags.get('lem')
            self.tCorpus.push(Entry(word, tags))

    def endElement(self, tag):
        """On </instance>, emit an Instance and reset the per-instance state."""
        if tag != 'instance':
            return
        head = None
        for i, entry in enumerate(self.tCorpus.list()):
            if 'head' in entry.tags:
                head = i    # the last entry tagged 'head' wins
        # NOTE(review): if no entry carries 'head', head is still None here and
        # the indexing below fails -- confirm whether such instances can occur.
        self.corpus.append(Instance(self.tCorpus.list(), head, self.tCorpus.list()[head]))
        self.tCorpus = Buffer(self.winSize)
        self.senses = []
        self.headFlag = False
        self.headCount = 0
        self.lemFlag = False
        self.doneFlag = False

"""
CatalanParser:
    This is an XML parser that wraps the LexeltHandler above.
    Originally, I had hoped that this parser would work for all of the
    languages, but the XML files are too inconsistent.  Additionally, they are
    not quite valid XML, so Python's XML handler fails.  But it still works
    well for Catalan. :)

Parameters:
    arg:        "test" or "train" depending on the file to parse 
    win:        the desired window size
    trainFile:  (optional) path of train file
    testFile:   (optional) path of test file
    answerFile: (optional) path of answer file
"""
class CatalanParser:
    """Reads the Catalan files one lexelt at a time and parses each with SAX.

    Parameters:
        arg:        "test" selects testFile; any other value selects trainFile
        win:        the desired window size
        trainFile:  (optional) path of train file
        testFile:   (optional) path of test file
        answerFile: (optional) path of answer file, read only in test mode
    """

    def __init__(self, arg, win, trainFile='/local/seed/katz/senseval/catalan/catalan.train', \
                 testFile='/local/seed/katz/senseval/catalan/catalan.test', \
                 answerFile='/local/seed/katz/senseval/catalan/catalan.answers'):
        self.testFlag = False
        self.winSize = win
        self.answerRE = re.compile(r"^(\S+) (\S+) (\S+)$")
        self.lexeltRE = re.compile(r'^<lexelt item="([^\.]*)')
        self.answerDict = {}
        # Defined up front so __del__ and getNextSet work even if open() fails.
        self.file = None
        if arg == 'test':
            self.testFlag = True
            fileName = testFile
        else:
            fileName = trainFile
        try:
            self.file = codecs.open(fileName, 'r', 'latin1')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
        if self.testFlag:
            self.answerDict = self.makeAnswerDict(answerFile)

    def __del__(self):
        # getattr: the attribute is missing on instances that bypassed __init__.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def getNextSet(self):
        """Return a (corpus, aWord) pair for the next lexelt in the XML file.

        Returns:
            corpus: a list of Instance objects
            aWord:  a string (the ambiguous word)
            (None, "none") at end of file or when the input file could not
            be opened.
        """
        if self.file is None:
            return None, "none"
        corpus = []
        p = xml.sax.make_parser()
        p.setContentHandler(LexeltHandler(corpus, self.winSize, self.testFlag, self.answerDict))
        inLexelt = False
        chunk = []
        while True:
            line = self.file.readline()
            if line == '':
                return None, "none"
            if not inLexelt:
                # Skip everything up to the next opening lexelt tag.
                if not line.startswith('<lexelt '):
                    continue
                inLexelt = True
                aWord = self.lexeltRE.search(line).group(1)
            chunk.append(line)
            if line.startswith('</lexelt'):
                # The lines were decoded from latin1; the SAX parser is fed
                # the whole lexelt re-encoded as UTF-8.
                p.feed(codecs.encode(''.join(chunk), 'utf8'))
                p.close()
                return corpus, aWord

    def makeAnswerDict(self, fileName):
        """Build the answer dictionary used later by the parser.

        Each line matching three space-separated fields contributes one
        entry: second field -> third field.  Blank lines are skipped.

        Parameters:
            fileName: the path of the answer file

        Returns:
            retDict: a dictionary of answers ({} if the file cannot be opened)
        """
        retDict = {}
        try:
            answers = codecs.open(fileName, 'r', 'latin1')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
            return retDict
        try:
            for line in answers:
                if not line.strip():
                    continue
                match = self.answerRE.search(line)
                retDict[match.group(2)] = match.group(3)
        finally:
            answers.close()
        return retDict
        
class RomanianParser:
    """Line-based parser for the Romanian files.

    The input is read one line at a time and split on single spaces; no XML
    library is involved.

    Parameters:
        arg:        "test" selects testFile; any other value selects trainFile
        win:        the desired window size
        trainFile:  (optional) path of train file
        testFile:   (optional) path of test file
        answerFile: (optional) path of answer file, read only in test mode
    """

    def __init__(self, arg, win, \
                 trainFile = '/local/seed/katz/senseval/romanian/romanian.train', \
                 testFile = '/local/seed/katz/senseval/romanian/romanian.test', \
                 answerFile = '/local/seed/katz/senseval/romanian/romanian.answers'):
        self.winSize = win
        self.testFlag = False
        self.headFlag = False    # True once the head word of the instance was seen
        self.headCount = 0       # <w lines counted after the head word
        self.doneFlag = False    # True once no more words are kept for the instance
        # Defined up front so __del__ and getNextSet work even if open() fails.
        self.file = None
        if arg == 'test':
            self.testFlag = True
            fileName = testFile
        else:
            fileName = trainFile
        try:
            self.file = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
        if self.testFlag:
            self.answerDict = self.makeAnswerDict(answerFile)

    def __del__(self):
        # getattr: the attribute is missing on instances that bypassed __init__.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def getNextSet(self):
        """Read lines up to the next '</lexelt>' line (or end of file).

        Returns:
            corpus: list of Instance objects, or None when none were built
            aWord:  the word taken from the <lexelt line, "None" if none seen
        """
        if self.file is None:
            return None, "None"
        contextFlag = False
        corpus = []
        tCorpus = Buffer(self.winSize)
        line = self.file.readline()
        aWord = "None"
        while '</lexelt>' not in line and line != "":
            sentence = line.split(' ')
            if contextFlag:
                if sentence[0] == "<w":
                    if self.headFlag:
                        # After the head word, stop keeping words once the
                        # count exceeds half of the window size.
                        if self.headCount > self.winSize/2.0:
                            self.doneFlag = True
                        self.headCount += 1
                    thisDict = self.makeDict(sentence)
                    if not self.doneFlag:
                        if "frm" not in thisDict:
                            thisDict["frm"] = thisDict["lem"]
                        if 'head' in thisDict:
                            self.headFlag = True
                            self.headCount = 0
                            thisDict['head'] = True
                            thisDict['senses'] = list(senses)
                            # The head word is stored under the lexelt word.
                            tCorpus.push(Entry(aWord, thisDict))
                        else:
                            tCorpus.push(Entry(thisDict["frm"], thisDict))
                if sentence[0] == "</instance>\n":
                    head = None
                    for i, entry in enumerate(tCorpus.list()):
                        if 'head' in entry.tags:
                            head = i    # the last entry tagged 'head' wins
                    # NOTE(review): head stays None when no word was tagged
                    # 'head'; the indexing below then fails -- confirm.
                    thisInstance = Instance(tCorpus.list(), head, tCorpus.list()[head])
                    corpus.append(thisInstance)
                    tCorpus = Buffer(self.winSize)
                    self.doneFlag = False
                    self.headFlag = False
                    self.headCount = 0
                    contextFlag = False
            else:
                if sentence[0] == "<lexelt":
                    aWord = self.getWord(sentence)
                if self.testFlag:
                    if sentence[0] == "<instance":
                        senses = self.answerDict[self.getID(sentence)]
                        contextFlag = True
                else:
                    if sentence[0] == "<answer":
                        senses = self.getSenses(sentence)
                        contextFlag = True
            line = self.file.readline()
        if corpus == []:
            return None, aWord
        return corpus, aWord

    def getID(self, sentence):
        """Return the quoted value of the first 'id=' token, else "ERROR"."""
        for block in sentence:
            if block[0:3] == "id=":
                return block.split('\"')[1]
        return "ERROR"

    def makeAnswerDict(self, fileName):
        """Read a tab-separated answer file into {first field: [other fields]}.

        Blank lines are skipped.  Only the line's newline is stripped, so a
        final line without one no longer loses its last character.

        Returns:
            retDict: the dictionary ({} if the file cannot be opened)
        """
        retDict = {}
        try:
            answers = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
            return retDict
        try:
            for line in answers.readlines():
                if not line.strip():
                    continue
                fields = line.rstrip('\n').split('\t')
                retDict[fields[0]] = fields[1:]
        finally:
            answers.close()
        return retDict

    def makeDict(self, sentence):
        """Turn the name="value" tokens after the first one into a dict."""
        attributes = {}
        for block in sentence[1:]:
            pieces = block.split("=")
            # Tokens without '=' carry no attribute and are skipped.
            if len(pieces) > 1:
                attributes[pieces[0]] = pieces[1].split('\"')[1]
        return attributes

    def getWord(self, sentence):
        """Return the part before the first '.' of the second token's quoted value."""
        block = sentence[1].split('\"')
        item = block[1].split(".")
        return item[0]

    def getSenses(self, sentence):
        """Return the third token's quoted value split on '/'."""
        block = sentence[2].split('\"')
        items = block[1].split("/")
        return items

    def newSection(self, word):
        """True when `word` starts with '<' but not with '</'."""
        return word[0] == '<' and word[1] != '/'

    def getTag(self, word):
        """Return what follows the first '<' in the part of `word` before '_'."""
        block = word.split('_')[0]
        retVal = block.split('<')[1]
        return retVal

class SpanishParser:
    """Line-based parser for the Spanish files.

    The input is read one line at a time and split on single spaces; no XML
    library is involved.

    Parameters:
        arg:        "test" selects testFile; any other value selects trainFile
        win:        the desired window size
        trainFile:  (optional) path of train file
        testFile:   (optional) path of test file
        answerFile: (optional) path of answer file, read only in test mode
    """

    def __init__(self, arg, win, trainFile='/local/seed/katz/senseval/spanish/spanish.train',\
                 testFile='/local/seed/katz/senseval/spanish/spanish.test', \
                 answerFile='/local/seed/katz/senseval/spanish/spanish.answers'):
        self.winSize = win
        self.testFlag = False
        self.headFlag = False    # True once the head word of the instance was seen
        self.headCount = 0       # <w lines counted after the head word
        self.doneFlag = False    # True once no more words are kept for the instance
        # Defined up front so __del__ and getNextSet work even if open() fails.
        self.file = None
        if arg == 'test':
            self.testFlag = True
            fileName = testFile
        else:
            fileName = trainFile
        try:
            self.file = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
        if self.testFlag:
            self.answerDict = self.makeAnswerDict(answerFile)

    def __del__(self):
        # getattr: the attribute is missing on instances that bypassed __init__.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def getNextSet(self):
        """Read lines up to the next '</lexelt>' line (or end of file).

        In test mode an instance whose id is not in the answer dictionary is
        skipped entirely.

        Returns:
            corpus: list of Instance objects, or None when none were built
            aWord:  the word taken from the <lexelt line, "None" if none seen
        """
        if self.file is None:
            return None, "None"
        contextFlag = False
        corpus = []
        tCorpus = Buffer(self.winSize)
        line = self.file.readline()
        aWord = "None"
        while '</lexelt>' not in line and line != "":
            sentence = line.split(' ')
            if contextFlag:
                if sentence[0] == "<w":
                    if self.headFlag:
                        # After the head word, stop keeping words once the
                        # count exceeds half of the window size.
                        if self.headCount > self.winSize/2.0:
                            self.doneFlag = True
                        self.headCount += 1
                    thisDict = self.makeDict(sentence)
                    if not self.doneFlag:
                        if 'head' in thisDict:
                            self.headFlag = True
                            self.headCount = 0
                            if "lem" not in thisDict:
                                thisDict["lem"] = thisDict["frm"]
                            thisDict['senses'] = [sense]
                            thisDict['head'] = True
                            # The head word is stored under the lexelt word.
                            tCorpus.push(Entry(aWord, thisDict))
                        else:
                            tCorpus.push(Entry(thisDict["frm"], thisDict))
                if sentence[0] == "</instance>\n":
                    head = None
                    for i, entry in enumerate(tCorpus.list()):
                        if 'head' in entry.tags:
                            head = i    # the last entry tagged 'head' wins
                    # NOTE(review): head stays None when no word was tagged
                    # 'head'; the indexing below then fails -- confirm.
                    thisInstance = Instance(tCorpus.list(), head, tCorpus.list()[head])
                    corpus.append(thisInstance)
                    tCorpus = Buffer(self.winSize)
                    contextFlag = False
                    self.doneFlag = False
                    self.headFlag = False
                    self.headCount = 0
            else:
                if sentence[0] == "<lexelt":
                    aWord = self.getWord(sentence)
                if self.testFlag:
                    if sentence[0] == "<instance":
                        instanceID = self.getID(sentence)
                        if instanceID in self.answerDict:
                            sense = self.answerDict[instanceID]
                            contextFlag = True
                else:
                    if sentence[0] == "<answer":
                        sense = self.getSense(sentence)
                        contextFlag = True
            line = self.file.readline()
        if corpus == []:
            return None, aWord
        return corpus, aWord

    def getID(self, sentence):
        """Return the quoted value of the first 'id=' token, else "ERROR"."""
        for block in sentence:
            if block[0:3] == "id=":
                return block.split('\"')[1]
        return "ERROR"

    def makeAnswerDict(self, fileName):
        """Read a space-separated answer file.

        For each line the first field maps to whatever follows the first '.'
        of the second field.  Blank lines are skipped.  Only the newline is
        stripped, so a final line without one no longer loses a character.

        Returns:
            retDict: the dictionary ({} if the file cannot be opened)
        """
        retDict = {}
        try:
            answers = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
            return retDict
        try:
            for line in answers.readlines():
                if not line.strip():
                    continue
                fields = line.split(' ')
                retDict[fields[0]] = fields[1].split('.')[1].rstrip('\n')
        finally:
            answers.close()
        return retDict

    def makeDict(self, sentence):
        """Turn the name="value" tokens after the first one into a dict."""
        attributes = {}
        for block in sentence[1:]:
            pieces = block.split("=")
            # Tokens without '=' carry no attribute and are skipped.
            if len(pieces) > 1:
                attributes[pieces[0]] = pieces[1].split('\"')[1]
        return attributes

    def getWord(self, sentence):
        """Return the part before the first '.' of the second token's quoted value."""
        block = sentence[1].split('\"')
        item = block[1].split(".")
        return item[0]

    def getSense(self, sentence):
        """Return the piece after the first '.' of the third token's quoted value."""
        block = sentence[2].split('\"')
        item = block[1].split(".")
        return item[1]

    def newSection(self, word):
        """True when `word` starts with '<' but not with '</'."""
        return word[0] == '<' and word[1] != '/'

    def getTag(self, word):
        """Return what follows the first '<' in the part of `word` before '_'."""
        block = word.split('_')[0]
        retVal = block.split('<')[1]
        return retVal
    
class BasqueParser:
    """Line-based parser for the Basque files.

    The input is read one line at a time and split on single spaces; word
    records are recognised by the token found at position 4 of a line.

    Parameters:
        arg:        "test" selects testFile; any other value selects trainFile
        win:        the desired window size
        trainFile:  (optional) path of train file
        testFile:   (optional) path of test file
        answerFile: (optional) path of answer file, read only in test mode
    """

    def __init__(self, arg, win, trainFile='/local/seed/katz/senseval/basque/basque.train',\
                 testFile='/local/seed/katz/senseval/basque/basque.test', \
                 answerFile='/local/seed/katz/senseval/basque/basque.answers'):
        self.winSize = win
        self.testFlag = False
        self.headFlag = False    # True once the head word of the instance was seen
        self.headCount = 0       # <word lines counted after the head word
        self.doneFlag = False    # True once no more words are kept for the instance
        # Defined up front so __del__ and getNextSet work even if open() fails.
        self.file = None
        if arg == 'test':
            self.testFlag = True
            fileName = testFile
        else:
            fileName = trainFile
        try:
            self.file = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
        if self.testFlag:
            self.answerDict = self.makeAnswerDict(answerFile)

    def __del__(self):
        # getattr: the attribute is missing on instances that bypassed __init__.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def getNextSet(self):
        """Read lines up to the next '</lexelt>' line (or end of file).

        Returns:
            corpus: list of Instance objects, or None when none were built
            aWord:  the word taken from the <lexelt line, "None" if none seen
        """
        if self.file is None:
            return None, "None"
        contextFlag = False
        corpus = []
        sense = []
        tCorpus = Buffer(self.winSize)
        line = self.file.readline()
        aWord = "None"
        while '</lexelt>' not in line and line != "":
            sentence = line.split(' ')
            if contextFlag:
                if len(sentence) > 4:
                    if sentence[4] == "<word":
                        # A new word record: its attributes start the dict that
                        # the following lemma/token lines fill in.
                        thisDict = self.makeDict(sentence)
                        if self.headFlag:
                            # After the head word, stop keeping words once the
                            # count exceeds half of the window size.
                            if self.headCount > self.winSize/2.0:
                                self.doneFlag = True
                            self.headCount += 1
                    if len(sentence) > 5:
                        if sentence[5][0:5] == "<lemm":
                            thisDict["lem"] = self.getMid(sentence[5])
                        if sentence[5][0:5] == "<toke":
                            thisDict["frm"] = self.getMid(sentence[5])
                    if not self.doneFlag:
                        if sentence[4][:7] == "</word>":
                            if 'head' in thisDict:
                                self.headFlag = True
                                thisDict['head'] = True
                                self.headCount = 0
                                if "case" not in thisDict \
                                or thisDict["case"] == "":
                                    thisDict["case"] = "_None_"
                                thisDict['senses'] = sense
                                # The head word is stored under the lexelt word.
                                tCorpus.push(Entry(aWord, thisDict))
                            else:
                                tCorpus.push(Entry(thisDict["frm"], thisDict))
                if sentence[0] == "</postagging>\n":
                    head = None
                    for i, entry in enumerate(tCorpus.list()):
                        if 'head' in entry.tags:
                            head = i    # the last entry tagged 'head' wins
                    if head == None:
                        # Diagnostic output before the indexing below fails.
                        print(aWord)
                        print([x.word for x in tCorpus.list()])
                    thisInstance = Instance(tCorpus.list(), head, tCorpus.list()[head])
                    corpus.append(thisInstance)
                    tCorpus = Buffer(self.winSize)
                    contextFlag = False
                    self.doneFlag = False
                    self.headFlag = False
                    self.headCount = 0
                    sense = []
            else:
                if sentence[0] == "<lexelt":
                    aWord = self.getWord(sentence)
                if self.testFlag:
                    if sentence[0] == "<instance":
                        sense = self.answerDict[self.getID(sentence)]
                        contextFlag = True
                else:
                    # In train mode the word records are read after the
                    # '</instance>' line; answers seen before it accumulate.
                    if sentence[0][:11] == "</instance>":
                        contextFlag = True
                    if sentence[0] == "<answer":
                        sense.append(self.getSense(sentence))
            line = self.file.readline()
        if corpus == []:
            return None, aWord
        return corpus, aWord

    def getMid(self, word):
        """Return the text between the first '>' and the next '<'."""
        mid = word.split('>')
        rest = mid[1].split('<')
        return rest[0]

    def getTopic(self, sentence):
        """Return "__TOPIC__" + the quoted value of the first 'topic=' token, else "ERROR"."""
        for block in sentence:
            if block[0:6] == "topic=":
                return "__TOPIC__" + block.split('\"')[1]
        return "ERROR"

    def getID(self, sentence):
        """Return the quoted value of the first 'id=' token, else "ERROR"."""
        for block in sentence:
            if block[0:3] == "id=":
                return block.split('\"')[1]
        return "ERROR"

    def makeAnswerDict(self, fileName):
        """Read a tab-separated answer file into {first field: [other fields]}.

        Blank lines are skipped.  Only the line's newline is stripped, so a
        final line without one no longer loses its last character.

        Returns:
            retDict: the dictionary ({} if the file cannot be opened)
        """
        retDict = {}
        try:
            answers = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
            return retDict
        try:
            for line in answers.readlines():
                if not line.strip():
                    continue
                fields = line.rstrip('\n').split('\t')
                retDict[fields[0]] = fields[1:]
        finally:
            answers.close()
        return retDict

    def makeDict(self, sentence):
        """Turn the name="value" tokens from position 5 onwards into a dict."""
        attributes = {}
        for block in sentence[5:]:
            pieces = block.split("=")
            # Tokens without '=' carry no attribute and are skipped.
            if len(pieces) > 1:
                attributes[pieces[0]] = pieces[1].split('\"')[1]
        return attributes

    def getWord(self, sentence):
        """Return the part before the first '.' of the second token's quoted value."""
        block = sentence[1].split('\"')
        item = block[1].split(".")
        return item[0]

    def getSense(self, sentence):
        """Return the quoted value of the third token."""
        block = sentence[2].split('\"')
        return block[1]

    def newSection(self, word):
        """True when `word` starts with '<' but not with '</'."""
        return word[0] == '<' and word[1] != '/'

    def getTag(self, word):
        """Return what follows the first '<' in the part of `word` before '_'."""
        block = word.split('_')[0]
        retVal = block.split('<')[1]
        return retVal
    
class ItalianParser:
    """Line-based parser for the Italian files.

    The input is read one line at a time and split on single spaces; lines
    are recognised by their tab-indented first token.  A set ends at the
    first blank line rather than at a closing lexelt tag.

    Parameters:
        arg:        "test" selects testFile; any other value selects trainFile
        win:        the desired window size
        trainFile:  (optional) path of train file
        testFile:   (optional) path of test file
        answerFile: (optional) path of answer file, read only in test mode
    """

    def __init__(self, arg, win, trainFile='/local/seed/katz/senseval/italian/italian.train',\
                 testFile='/local/seed/katz/senseval/italian/italian.test', \
                 answerFile='/local/seed/katz/senseval/italian/italian.answers'):
        self.winSize = win
        self.testFlag = False
        self.headFlag = False    # True once the head word of the instance was seen
        self.headCount = 0       # <word lines counted after the head word
        self.doneFlag = False    # True once no more words are kept for the instance
        # Defined up front so __del__ and getNextSet work even if open() fails.
        self.file = None
        if arg == 'test':
            self.testFlag = True
            fileName = testFile
        else:
            fileName = trainFile
        try:
            self.file = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
        if self.testFlag:
            self.answerDict = self.makeAnswerDict(answerFile)

    def __del__(self):
        # getattr: the attribute is missing on instances that bypassed __init__.
        handle = getattr(self, 'file', None)
        if handle is not None:
            handle.close()

    def getNextSet(self):
        """Read lines up to the next blank line (or end of file).

        Returns:
            corpus: list of Instance objects, or None when none were built
            aWord:  the word taken from the last <instance line, "None" if
                    none was seen
        """
        if self.file is None:
            return None, "None"
        contextFlag = False
        corpus = []
        tCorpus = Buffer(self.winSize)
        line = self.file.readline()
        aWord = "None"
        while line != "\n" and line != "":
            sentence = line.split(' ')
            if contextFlag:
                if len(sentence) > 1:
                    if sentence[1] == "<word":
                        if self.headFlag:
                            # After the head word, stop keeping words once the
                            # count exceeds half of the window size.
                            if self.headCount > self.winSize/2.0:
                                self.doneFlag = True
                            self.headCount += 1
                        thisDict = self.makeDict(sentence)
                        if 'pos' not in thisDict:
                            thisDict['pos'] = "NONE"
                    if len(sentence) > 2:
                        if sentence[2][0:5] == "<lemm":
                            thisDict["lemmas"] = self.getMid(sentence[2])
                        if sentence[2][0:5] == "<toke":
                            thisDict["token"] = self.getMid(sentence[2])
                    if sentence[1][:7] == "</word>":
                        if not self.doneFlag:
                            if "lemmas" not in thisDict:
                                thisDict["lemmas"] = thisDict["token"]
                            if 'annotated' in thisDict:
                                # The 'annotated' attribute marks the head word.
                                self.headFlag = True
                                self.headCount = 0
                                thisDict['senses'] = [sense]
                                thisDict['head'] = True
                                del thisDict['annotated']
                                tCorpus.push(Entry(aWord, thisDict))
                            else:
                                tCorpus.push(Entry(thisDict["token"], thisDict))
                if sentence[0] == "\t\t\t</postagging>\n":
                    head = None
                    for i, entry in enumerate(tCorpus.list()):
                        if 'head' in entry.tags:
                            head = i    # the last entry tagged 'head' wins
                    # NOTE(review): head stays None when no word was tagged
                    # 'head'; the indexing below then fails -- confirm.
                    thisInstance = Instance(tCorpus.list(), head, tCorpus.list()[head])
                    corpus.append(thisInstance)
                    tCorpus = Buffer(self.winSize)
                    self.doneFlag = False
                    self.headFlag = False
                    self.headCount = 0
                    contextFlag = False
            else:
                if self.testFlag:
                    if sentence[0] == "\t\t<instance":
                        aWord = self.getWord(sentence)
                        sense = self.answerDict[self.getID(sentence)]
                        contextFlag = True
                else:
                    if sentence[0] == "\t\t<instance":
                        aWord = self.getWord(sentence)
                    if sentence[0] == "\t\t\t<answer":
                        sense = self.getSense(sentence)
                        contextFlag = True
            line = self.file.readline()
        if corpus == []:
            return None, aWord
        return corpus, aWord

    def getMid(self, word):
        """Return the text between the first '>' and the next '<'."""
        mid = word.split('>')
        rest = mid[1].split('<')
        return rest[0]

    def getID(self, sentence):
        """Return the quoted value of the first 'id=' token, else "ERROR"."""
        for block in sentence:
            if block[0:3] == "id=":
                return block.split('\"')[1]
        return "ERROR"

    def makeAnswerDict(self, fileName):
        """Read a space-separated answer file into {first field: second field}.

        The last two characters of the second field are dropped.  Blank
        lines are skipped.

        Returns:
            retDict: the dictionary ({} if the file cannot be opened)
        """
        retDict = {}
        try:
            answers = open(fileName, 'r')
        except IOError:
            print('error opening file `%s` for reading\n' % fileName)
            return retDict
        try:
            for line in answers.readlines():
                if not line.strip():
                    continue
                fields = line.split(' ')
                # NOTE(review): presumably strips a two-character line ending;
                # kept as is -- verify against the answer file format.
                retDict[fields[0]] = fields[1][:-2]
        finally:
            answers.close()
        return retDict

    def makeDict(self, sentence):
        """Turn the name="value" tokens from position 2 onwards into a dict."""
        attributes = {}
        for block in sentence[2:]:
            pieces = block.split("=")
            # Tokens without '=' carry no attribute and are skipped.
            if len(pieces) > 1:
                attributes[pieces[0]] = pieces[1].split('\"')[1]
        return attributes

    def getWord(self, sentence):
        """Return the part before the first '.' of the second token's quoted value."""
        block = sentence[1].split('\"')
        item = block[1].split(".")
        return item[0]

    def getSense(self, sentence):
        """Return the quoted value of the fourth token."""
        block = sentence[3].split('\"')
        return block[1]

    def newSection(self, word):
        """True when `word` starts with '<' but not with '</'."""
        return word[0] == '<' and word[1] != '/'

    def getTag(self, word):
        """Return what follows the first '<' in the part of `word` before '_'."""
        block = word.split('_')[0]
        retVal = block.split('<')[1]
        return retVal

class GoogleParser:
    """Line-based reader for a directory tree of per-sense data files."""

    def readFile(self, datadir, arg):
        """Read every file found one directory level below `datadir`.

        Parameters:
            datadir: directory whose sub-directories contain the files to read
            arg:     not used by this method

        Returns:
            corpus: list of the Instance objects built from all files read
                    (empty when nothing was found)
        """
        # Created once: an empty datadir now yields [] instead of raising
        # UnboundLocalError, and results are no longer reset for every file.
        corpus = []
        for sensedir in os.listdir(datadir):
            for fileName in os.listdir(os.path.join(datadir, sensedir)):
                path = os.path.join(datadir, sensedir, fileName)
                section = None
                contextFlag = False
                try:
                    handle = open(path, 'r')
                except IOError:
                    # Report the path that was actually opened (with sensedir).
                    print('error opening file `%s` for reading\n' % path)
                    continue
                try:
                    fileList = handle.readlines()
                finally:
                    handle.close()
                tCorpus = []
                for line in fileList:
                    sentence = line.split(' ')
                    if contextFlag:
                        if sentence[0] == "<w":
                            thisDict = self.makeDict(sentence)
                            # NOTE(review): Entry is called with four arguments
                            # and Instance with two here, unlike the other
                            # parsers in this file -- confirm against
                            # SensevalStructs.
                            # tCorpus is a plain list, so entries are appended
                            # (lists have no push method).
                            if 'head' in thisDict:
                                tCorpus.append(Entry(thisDict["frm"], thisDict["lem"], \
                                               thisDict["pos"], ["head", "sense%s" % sense, "lem"]))
                                (lemma, pos) = (thisDict["lem"], thisDict["pos"])
                            else:
                                tCorpus.append(Entry(thisDict["frm"], thisDict["lem"], \
                                                     thisDict["pos"], [section, "lem"]))
                        if sentence[0] == "</context>\n":
                            # NOTE(review): word, sense, lemma and pos are only
                            # bound if the matching lines were seen earlier.
                            thisInstance = Instance(tCorpus, Entry(word, lemma, pos, \
                                                           ["sense%s" % sense, "lem"]))
                            corpus.append(thisInstance)
                            tCorpus = []
                            contextFlag = False
                            section = None
                    else:
                        if sentence[0] == "<lexelt":
                            word = self.getWord(sentence)
                        elif sentence[0] == "<context>":
                            contextFlag = True
                        elif sentence[0] == "<answer":
                            sense = self.getSense(sentence)
                            contextFlag = True
        return corpus

    def getID(self, sentence):
        """Return the quoted value of the first 'id=' token, else "ERROR"."""
        for block in sentence:
            if block[0:3] == "id=":
                return block.split('\"')[1]
        return "ERROR"

    def makeDict(self, sentence):
        """Turn the name="value" tokens after the first one into a dict.

        Tokens without '=' are ignored, and 'pos' is always set to 'UNK',
        overriding any pos attribute found in the line.
        """
        attributes = {}
        for block in sentence[1:]:
            pieces = block.split("=")
            if len(pieces) > 1:
                attributes[pieces[0]] = pieces[1].split('\"')[1]
        attributes['pos'] = 'UNK'
        return attributes

    def getWord(self, sentence):
        """Return the part before the first '.' of the second token's quoted value."""
        block = sentence[1].split('\"')
        item = block[1].split(".")
        return item[0]

    def getSense(self, sentence):
        """Return the piece after the first '.' of the second token's quoted value."""
        block = sentence[1].split('\"')
        item = block[1].split(".")
        return item[1]

    def newSection(self, word):
        """True when `word` starts with '<' but not with '</'."""
        return word[0] == '<' and word[1] != '/'

    def getTag(self, word):
        """Return what follows the first '<' in the part of `word` before '_'."""
        block = word.split('_')[0]
        retVal = block.split('<')[1]
        return retVal

