Training the part-of-speech tagger with spaCy

Posted on Wed 21 December 2016 in spaCy

In [1]:
# Imports needed to build and train the tagger
from __future__ import unicode_literals
from __future__ import print_function

from pathlib import Path

from spacy.vocab import Vocab
from spacy.tagger import Tagger
from spacy.tokens import Doc
from spacy.gold import GoldParse

import random
In [2]:
TAG_MAP = {
    'N': {"pos": "NOUN"},
    'V': {"pos": "VERB"},
    'J': {"pos": "ADJ"}
}
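The tag map translates each coarse tag in the toy tagset into a Universal POS value, which is what fills in pos_ on the tokens later on. If your annotations use more tags, the same pattern extends; a hypothetical extra entry for a determiner tag would look like this:

TAG_MAP['D'] = {"pos": "DET"}  # hypothetical extra tag; the toy data below does not use it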
In [3]:
DATA = [
    (
        ["I", "like", "green", "eggs"],
        ["N", "V", "J", "N"]
    ),
    (
        ["Eat", "blue", "ham"],
        ["V", "J", "N"]
    )
]
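Each training example is simply a list of words paired with a parallel list of coarse tags from TAG_MAP. With real annotations you would more likely start from (word, tag) pairs; a minimal, hypothetical helper to convert those into this shape could look like this:

def to_example(tagged_tokens):
    # Split a list of (word, tag) pairs into the parallel lists used above.
    words = [word for word, tag in tagged_tokens]
    tags = [tag for word, tag in tagged_tokens]
    return words, tags

DATA.append(to_example([("Ham", "N"), ("is", "V"), ("green", "J")]))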
In [4]:
def ensure_dir(path):
    if not path.exists():
        path.mkdir()
In [5]:
def main(output_dir=None):
    if output_dir is not None:
        output_dir = Path(output_dir)
        ensure_dir(output_dir)
        ensure_dir(output_dir / "pos")
        ensure_dir(output_dir / "vocab")

    vocab = Vocab(tag_map=TAG_MAP)
    # Tagger's default_templates argument is where the features are specified;
    # here we rely on the defaults (see spacy/tagger.pyx).
    tagger = Tagger(vocab)
    # Train for 25 iterations over the toy data, reshuffling it between epochs.
    for i in range(25):
        for words, tags in DATA:
            doc = Doc(vocab, words=words)
            gold = GoldParse(doc, tags=tags)
            tagger.update(doc, gold)
        random.shuffle(DATA)
    tagger.model.end_training()
    # Tag a held-out sentence to check what the model has learned.
    doc = Doc(vocab, orths_and_spaces=zip(["I", "like", "blue", "eggs"], [True] * 4))
    tagger(doc)
    for word in doc:
        print(word.text, word.tag_, word.pos_)
    # Persist the model and the string table if an output directory was given.
    if output_dir is not None:
        tagger.model.dump(str(output_dir / 'pos' / 'model'))
        with (output_dir / 'vocab' / 'strings.json').open('w') as file_:
            tagger.vocab.strings.dump(file_)
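main() only writes anything to disk when an output directory is given; otherwise it just trains in memory and prints the tagged test sentence. To keep the toy model around, you could call it with a path of your own choosing (the directory below is only an example):

main(output_dir="/tmp/toy_tagger")  # writes pos/model and vocab/strings.json under this path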
In [6]:
if __name__ == '__main__':
    main()
I SP SPACE
like SP SPACE
blue J ADJ
eggs N NOUN
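To tag more sentences without retraining every time, one option is to refactor main() so it returns the vocab and tagger it builds. A minimal sketch under that assumption, reusing only the calls already shown above:

def tag_words(vocab, tagger, words):
    # Build a Doc from plain words and run the trained tagger over it.
    doc = Doc(vocab, words=words)
    tagger(doc)
    return [(word.text, word.tag_, word.pos_) for word in doc]

# e.g. tag_words(vocab, tagger, ["Eat", "green", "eggs"])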