ํ์ฌ CoreNLPParser.tag()
๋ฅผ ์ฌ์ฉํ๋ฉด Stanford CoreNLP์ "์ฌ ํ ํฐํ"๊ฐ ์๊ธฐ์น ์๊ฒ ๋ฐ์ํฉ๋๋ค.
>>> from nltk.parse.corenlp import CoreNLPParser
>>> ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
>>> sent = ['my', 'phone', 'number', 'is', '1111', '1111', '1111']
>>> ner_tagger.tag(sent)
[('my', 'O'),
('phone', 'O'),
('number', 'O'),
('is', 'O'),
('1111\xa01111\xa01111', 'NUMBER')]
예상되는 동작은 다음과 같아야 합니다.
>>> from nltk.parse.corenlp import CoreNLPParser
>>> ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
>>> sent = ['my', 'phone', 'number', 'is', '1111', '1111', '1111']
>>> ner_tagger.tag(sent)
[('my', 'O'), ('phone', 'O'), ('number', 'O'), ('is', 'O'), ('1111', 'DATE'), ('1111', 'DATE'), ('1111', 'DATE')]
제안된 솔루션은 .tag() 및 .tag_sents()에 대한 properties 인수 오버로딩을 허용하고(예: https://github.com/nltk/nltk/blob/develop/nltk/parse/corenlp.py#L348), 기본적으로 properties = {'tokenize.whitespace':'true'}를 사용하는 것입니다. tag_sents()가 공백으로 토큰을 연결하기 때문입니다.
def tag_sents(self, sentences, properties=None):
    """
    Tag multiple sentences.

    Takes multiple sentences as a list where each sentence is a list of
    tokens. Each token list is joined with single spaces and handed to
    ``raw_tag_sents``; only the first tagged sentence of each result is kept.

    :param sentences: Input sentences to tag
    :type sentences: list(list(str))
    :param properties: Properties forwarded to ``raw_tag_sents``. When
        ``None``, ``{'tokenize.whitespace': 'true'}`` is used.
    :type properties: dict or None
    :rtype: list(list(tuple(str, str)))
    """
    # Converting list(list(str)) -> list(str)
    joined_sentences = (' '.join(words) for words in sentences)
    # Identity check: only a genuinely missing argument selects the default,
    # regardless of how the supplied object implements ``==``.
    if properties is None:
        properties = {'tokenize.whitespace': 'true'}
    # raw_tag_sents yields a list of tagged sentences per input string;
    # take the first one for each input.
    return [
        tagged[0]
        for tagged in self.raw_tag_sents(joined_sentences, properties)
    ]
def tag(self, sentence, properties=None):
    """
    Tag a single sentence given as a list of tokens.

    The sentence is wrapped in a one-element batch, passed to ``tag_sents``
    together with ``properties``, and the first (only) result is returned.

    :param sentence: Tokens of the sentence to tag
    :param properties: Passed through unchanged to ``tag_sents``
    :rtype: list(tuple(str, str))

    >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='ner')
    >>> tokens = 'Rami Eid is studying at Stony Brook University in NY'.split()
    >>> parser.tag(tokens)
    [('Rami', 'PERSON'), ('Eid', 'PERSON'), ('is', 'O'), ('studying', 'O'), ('at', 'O'), ('Stony', 'ORGANIZATION'),
    ('Brook', 'ORGANIZATION'), ('University', 'ORGANIZATION'), ('in', 'O'), ('NY', 'O')]

    >>> parser = CoreNLPParser(url='http://localhost:9000', tagtype='pos')
    >>> tokens = "What is the airspeed of an unladen swallow ?".split()
    >>> parser.tag(tokens)
    [('What', 'WP'), ('is', 'VBZ'), ('the', 'DT'),
    ('airspeed', 'NN'), ('of', 'IN'), ('an', 'DT'),
    ('unladen', 'JJ'), ('swallow', 'VB'), ('?', '.')]
    """
    single_batch = [sentence]
    tagged_batch = self.tag_sents(single_batch, properties)
    return tagged_batch[0]
def raw_tag_sents(self, sentences, properties=None):
    """
    Tag multiple sentences.

    Takes multiple sentences as a list where each sentence is a string.
    This is a generator: for every input string it calls ``self.api_call``
    and yields the tagged sentences found in the response.

    :param sentences: Input sentences to tag
    :type sentences: list(str)
    :param properties: Extra properties merged over the defaults
        (``ssplit.isOneSentence`` and ``annotators``) before each call.
    :type properties: dict or None
    :rtype: list(list(list(tuple(str, str))))
    :raises AssertionError: if ``self.tagtype`` is neither 'pos' nor 'ner'
        (raised when iteration starts).
    """
    default_properties = {
        'ssplit.isOneSentence': 'true',
        'annotators': 'tokenize,ssplit,',
    }
    default_properties.update(properties or {})

    # Supports only 'pos' or 'ner' tags.
    assert self.tagtype in ['pos', 'ner'], \
        "CoreNLP tagger supports only 'pos' or 'ner' tags."

    # Rebuild the annotator list instead of blindly concatenating: a
    # caller-supplied 'annotators' value may lack the trailing comma or may
    # already name the tag type, and plain ``+=`` would then produce an
    # invalid annotator name such as 'ssplitpos' or 'pospos'.
    annotators = [
        name.strip()
        for name in default_properties['annotators'].split(',')
        if name.strip()
    ]
    if self.tagtype not in annotators:
        annotators.append(self.tagtype)
    default_properties['annotators'] = ','.join(annotators)

    for sentence in sentences:
        tagged_data = self.api_call(sentence, properties=default_properties)
        yield [
            [
                (token['word'], token[self.tagtype])
                for token in tagged_sentence['tokens']
            ]
            for tagged_sentence in tagged_data['sentences']
        ]
사용자가 입력한 문자열 토큰 목록을 적용해야 합니다.
세부 정보: https://stackoverflow.com/questions/52250268/why-do-corenlp-ner-tagger-and-ner-tagger-join-the-separated-numbers-together
.tag()가 raw_tag_sents 전에 속성을 오버로드하도록 허용하면 사용자가 #1876과 같은 경우를 쉽게 처리할 수 있습니다.
좋아 보입니다.
몇 가지 사소한 의견입니다.
if properties == None 이 아니라 if properties is None 이어야 합니다.
assert self.tagtype in ['pos', 'ner'] 는 assert self.tagtype in ['pos', 'ner'], "CoreNLP tagger supports only 'pos' or 'ner' tags." 이어야 합니다.
๋ฌธ์์ด์ ๊ฒฐํฉํ๊ณ ๋ถํ ํ๋ ์์ด๋์ด๊ฐ ์ ๋ง ๋ง์์ ๋ค์ง ์์ต๋๋ค. ๊ฐ๋จํ ๋ฌธ์์ด ๋์ ์ CoreNLP์ ๋จ์ด ๋ชฉ๋ก์ ๋ฌธ์ฅ์ผ๋ก ์ ๋ฌํ๋ ๋ฐฉ๋ฒ์ด ์์ ์ ์์ต๋๋ค.
안녕하세요, 이 문제를 첫 번째 문제로 삼고 싶습니다.
당신이 문제에 관심이 있다는 것은 대단한 일입니다. 질문이 있으면 여기에서 질문하세요.
가장 유용한 댓글
์ข์ ๋ณด์ธ๋ค.
๋ช ๊ฐ์ง ์ฌ์ํ ์๊ฒฌ์ ๋๋ค.
if properties is None
๊ฐ ์๋๋ผif properties == None
if properties is None
์ด์ด์ผ ํฉ๋๋ค.assert self.tagtype in ['pos', 'ner']
๋assert self.tagtype in ['pos', 'ner'], "CoreNLP tagger supports only 'pos' or 'ner' tags."
์ด์ด์ผ ํฉ๋๋ค.๋ฌธ์์ด์ ๊ฒฐํฉํ๊ณ ๋ถํ ํ๋ ์์ด๋์ด๊ฐ ์ ๋ง ๋ง์์ ๋ค์ง ์์ต๋๋ค. ๊ฐ๋จํ ๋ฌธ์์ด ๋์ ์ CoreNLP์ ๋จ์ด ๋ชฉ๋ก์ ๋ฌธ์ฅ์ผ๋ก ์ ๋ฌํ๋ ๋ฐฉ๋ฒ์ด ์์ ์ ์์ต๋๋ค.