Simplificación del conjunto de etiquetas POS francesas con NLTK

Question

Dec 16, 2014, 09:19 PM

Simplificación del conjunto de etiquetas POS francesas con NLTK

¿Cómo se puede simplificar la parte de las etiquetas de voz devueltas por el etiquetador POS francés de Stanford? Es bastante fácil leer una oración en inglés en NLTK, encontrar la parte del discurso de cada palabra, luego usar map_tag () para simplificar el conjunto de etiquetas:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

english = u"the whole earth swarms with living beings, every plant, every grain and leaf, supports the life of thousands."

path_to_english_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\english-bidirectional-distsim.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

#define english and french taggers
english_tagger = POSTagger(path_to_english_model, path_to_jar, encoding="utf-8")

#each tuple in list_of_english_pos_tuples = (word, pos)
list_of_english_pos_tuples = english_tagger.tag(word_tokenize(english))

simplified_pos_tags_english = [(word, map_tag('en-ptb', 'universal', tag)) for word, tag in list_of_english_pos_tuples]

print simplified_pos_tags_english

#output = [(u'the', u'DET'), (u'whole', u'ADJ'), (u'earth', u'NOUN'), (u'swarms', u'NOUN'), (u'with', u'ADP'), (u'living', u'NOUN'), (u'beings', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'plant', u'NOUN'), (u',', u'.'), (u'every', u'DET'), (u'grain', u'NOUN'), (u'and', u'CONJ'), (u'leaf', u'NOUN'), (u',', u'.'), (u'supports', u'VERB'), (u'the', u'DET'), (u'life', u'NOUN'), (u'of', u'ADP'), (u'thousands', u'NOUN'), (u'.', u'.')]

Pero no estoy seguro de cómo asignar las etiquetas francesas devueltas por el siguiente código al conjunto de etiquetas universal:

#!/usr/bin/python
# -*- coding: utf-8 -*-

import os
from nltk.tag.stanford import POSTagger
from nltk.tokenize import word_tokenize
from nltk.tag import map_tag

#set java_home path from within script. Run os.getenv("JAVA_HOME") to test java_home
os.environ["JAVA_HOME"] = "C:\\Program Files\\Java\\jdk1.7.0_25\\bin"

french = u"Chaque plante, chaque graine, chaque particule de matière organique contient des milliers d'atomes animés."

path_to_french_model = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\models\\french.tagger"
path_to_jar = "C:\\Text\\Professional\\Digital Humanities\\Packages and Tools\\Stanford Packages\\stanford-postagger-full-2014-08-27\\stanford-postagger-full-2014-08-27\\stanford-postagger.jar"

french_tagger = POSTagger(path_to_french_model, path_to_jar, encoding="utf-8")

list_of_french_pos_tuples = french_tagger.tag(word_tokenize(french))

#up to this point all is well, but I'm not sure how to successfully create a simplified pos tagset with the French tuples
simplified_pos_tags_french = [(word, map_tag('SOME_ARGUMENT', 'universal', tag)) for word, tag in list_of_french_pos_tuples]
print simplified_pos_tags_french

¿Alguien sabe cómo simplificar el conjunto de etiquetas predeterminado utilizado por el modelo francés en el etiquetador POS de Stanford? Agradecería cualquier idea que otros puedan ofrecer sobre esta pregunta.