Prof. Walmes Marques Zeviani
30 Maio 2017
# Attach the NLP package and list the objects it places on the search path
# (the listing is echoed in the `##` lines that follow).
library(NLP)
ls("package:NLP")
## [1] "annotate" "AnnotatedPlainTextDocument"
## [3] "annotation" "Annotation"
## [5] "annotations_in_spans" "Annotator"
## [7] "Annotator_Pipeline" "as.Annotation"
## [9] "as.Annotator_Pipeline" "as.Span"
## [11] "as.Span_Tokenizer" "as.String"
## [13] "as.Tagged_Token" "as.Token_Tokenizer"
## [15] "blankline_tokenizer" "Brown_POS_tags"
## [17] "chunked_sents" "CoNLLTextDocument"
## [19] "CoNLLUTextDocument" "content"
## [21] "content<-" "features"
## [23] "is.Annotation" "is.Span"
## [25] "is.Span_Tokenizer" "is.String"
## [27] "is.Tagged_Token" "is.Token_Tokenizer"
## [29] "meta" "meta<-"
## [31] "next_id" "ngrams"
## [33] "paras" "parsed_paras"
## [35] "parsed_sents" "parse_IETF_language_tag"
## [37] "parse_ISO_8601_datetime" "Penn_Treebank_POS_tags"
## [39] "Regexp_Tokenizer" "sents"
## [41] "Simple_Chunk_Annotator" "Simple_Entity_Annotator"
## [43] "Simple_Para_Token_Annotator" "Simple_POS_Tag_Annotator"
## [45] "Simple_Sent_Token_Annotator" "Simple_Stem_Annotator"
## [47] "Simple_Word_Token_Annotator" "single_feature"
## [49] "Span" "Span_Tokenizer"
## [51] "String" "tagged_paras"
## [53] "tagged_sents" "TaggedTextDocument"
## [55] "Tagged_Token" "tagged_words"
## [57] "Token_Tokenizer" "Tree"
## [59] "Tree_apply" "Tree_parse"
## [61] "Universal_POS_tags" "Universal_POS_tags_map"
## [63] "whitespace_tokenizer" "WordListDocument"
## [65] "wordpunct_tokenizer" "words"
# Attach openNLP and list the objects it places on the search path; the
# Maxent_* annotator constructors used below appear in this listing.
library(openNLP)
ls("package:openNLP")
## [1] "Maxent_Chunk_Annotator" "Maxent_Entity_Annotator"
## [3] "Maxent_POS_Tag_Annotator" "Maxent_Sent_Token_Annotator"
## [5] "Maxent_Word_Token_Annotator" "Parse_Annotator"
# Attach openNLPdata.
library(openNLPdata)
# The lines below are kept commented out: they record an attempt to build a
# sentence annotator for Portuguese and the error message it produced
# (no model file found for language 'pt').
# ls("package:openNLPdata")
# Maxent_Sent_Token_Annotator(language = "pt")
# Error in Maxent_Simple_Sent_Tokenizer(language, probs, model) :
# Could not find model file for language 'pt'.
# Please make sure package 'openNLPmodels.pt' is installed,
# available from <http://datacube.wu.ac.at/>.
# Example text: three short English sentences.
s <- "The book is on the table. The sky is blue. My cat ate your fish."

# Sentence annotator for English that also reports a probability per span.
en_sen <- Maxent_Sent_Token_Annotator(probs = TRUE, language = "en")

# Run the sentence annotator over the text and show the resulting spans.
sen <- annotate(s, en_sen)
print(sen)
## id type start end features
## 1 sentence 1 25 prob=0.9990691
## 2 sentence 27 42 prob=0.9880874
## 3 sentence 44 64 prob=0.986041
# Recover the text of every sentence from its start/end character positions.
# The text is repeated once per annotation so substr() returns one piece
# per span.
local({
  spans <- as.data.frame(sen)
  substr(rep(s, times = nrow(spans)),
         start = spans$start,
         stop = spans$end)
})
## [1] "The book is on the table." "The sky is blue."
## [3] "My cat ate your fish."
# Word tokenizer for English, with a probability attached to each token.
en_word <- Maxent_Word_Token_Annotator(probs = TRUE, language = "en")

# Word annotation is computed on top of the existing sentence annotations,
# which are passed in as the starting annotation set.
word <- annotate(s, en_word, sen)
print(word)
## id type start end features
## 1 sentence 1 25 prob=0.9990691,
## constituents=<<integer,7>>
## 2 sentence 27 42 prob=0.9880874,
## constituents=<<integer,5>>
## 3 sentence 44 64 prob=0.986041,
## constituents=<<integer,6>>
## 4 word 1 3 prob=1
## 5 word 5 8 prob=1
## 6 word 10 11 prob=1
## 7 word 13 14 prob=1
## 8 word 16 18 prob=1
## 9 word 20 24 prob=0.9942408
## 10 word 25 25 prob=1
## 11 word 27 29 prob=1
## 12 word 31 33 prob=1
## 13 word 35 36 prob=1
## 14 word 38 41 prob=0.9955833
## 15 word 42 42 prob=1
## 16 word 44 45 prob=1
## 17 word 47 49 prob=1
## 18 word 51 53 prob=1
## 19 word 55 58 prob=1
## 20 word 60 63 prob=0.9771182
## 21 word 64 64 prob=1
# Part-of-speech tagger for English, with a probability attached to each tag.
en_pos <- Maxent_POS_Tag_Annotator(probs = TRUE, language = "en")

# Tagging is computed on top of the sentence + word annotations.
pos <- annotate(s, en_pos, word)
print(pos)
## id type start end features
## 1 sentence 1 25 prob=0.9990691,
## constituents=<<integer,7>>
## 2 sentence 27 42 prob=0.9880874,
## constituents=<<integer,5>>
## 3 sentence 44 64 prob=0.986041,
## constituents=<<integer,6>>
## 4 word 1 3 prob=1, POS=DT, POS_prob=0.9895104
## 5 word 5 8 prob=1, POS=NN, POS_prob=0.9960107
## 6 word 10 11 prob=1, POS=VBZ, POS_prob=0.9951052
## 7 word 13 14 prob=1, POS=IN, POS_prob=0.9910109
## 8 word 16 18 prob=1, POS=DT, POS_prob=0.9952673
## 9 word 20 24 prob=0.9942408, POS=NN,
## POS_prob=0.9778091
## 10 word 25 25 prob=1, POS=., POS_prob=0.9912864
## 11 word 27 29 prob=1, POS=DT, POS_prob=0.9897729
## 12 word 31 33 prob=1, POS=NN, POS_prob=0.9934866
## 13 word 35 36 prob=1, POS=VBZ, POS_prob=0.9834103
## 14 word 38 41 prob=0.9955833, POS=JJ,
## POS_prob=0.8635544
## 15 word 42 42 prob=1, POS=., POS_prob=0.9895226
## 16 word 44 45 prob=1, POS=PRP$, POS_prob=0.929196
## 17 word 47 49 prob=1, POS=NN, POS_prob=0.9711728
## 18 word 51 53 prob=1, POS=VBD, POS_prob=0.1828995
## 19 word 55 58 prob=1, POS=PRP$, POS_prob=0.9298585
## 20 word 60 63 prob=0.9771182, POS=NN,
## POS_prob=0.969467
## 21 word 64 64 prob=1, POS=., POS_prob=0.9865147
# Turn the annotation object into a plain list and inspect the structure of
# its fifth entry.
L <- as.list(x = pos)
str(object = L[[5L]])
## Classes 'Annotation', 'Span' hidden list of 5
## $ id : int 5
## $ type : chr "word"
## $ start : int 5
## $ end : int 8
## $ features:List of 1
## ..$ :List of 3
## .. ..$ prob : num 1
## .. ..$ POS : chr "NN"
## .. ..$ POS_prob: num 0.996
## - attr(*, "meta")=List of 2
## ..$ POS_tagset : chr "en-ptb"
## ..$ POS_tagset_URL: chr "http://www.comp.leeds.ac.uk/ccalas/tagsets/upenn.html"
# Tabulate the annotations and keep only the rows describing word tokens.
# which() drops NA comparisons, matching what subset() does.
da <- as.data.frame(pos)
da <- da[which(da$type == "word"), , drop = FALSE]
print(da)
## id type start end features
## 4 4 word 1 3 1, DT, 0.989510379435813
## 5 5 word 5 8 1, NN, 0.996010731782666
## 6 6 word 10 11 1, VBZ, 0.995105150195623
## 7 7 word 13 14 1, IN, 0.991010865026466
## 8 8 word 16 18 1, DT, 0.995267288891455
## 9 9 word 20 24 0.994240782176942, NN, 0.977809130528209
## 10 10 word 25 25 1, ., 0.991286374901935
## 11 11 word 27 29 1, DT, 0.989772882130157
## 12 12 word 31 33 1, NN, 0.99348656360271
## 13 13 word 35 36 1, VBZ, 0.983410250476636
## 14 14 word 38 41 0.995583348038642, JJ, 0.863554431854847
## 15 15 word 42 42 1, ., 0.989522585087634
## 16 16 word 44 45 1, PRP$, 0.929195992279972
## 17 17 word 47 49 1, NN, 0.971172754645763
## 18 18 word 51 53 1, VBD, 0.182899549187972
## 19 19 word 55 58 1, PRP$, 0.929858519457671
## 20 20 word 60 63 0.977118228700449, NN, 0.969466962349041
## 21 21 word 64 64 1, ., 0.986514678030217
# Extract only the POS tags and the corresponding words.
#
# vapply() pins the result to a character vector: sapply() would silently
# hand back a list when `da` has no rows or a token has no POS feature,
# whereas vapply() returns character(0) or fails loudly, respectively.
p <- vapply(da$features, function(f) f[["POS"]], character(1))

# substr() is vectorised over start/stop once the text is repeated per token,
# so an element-wise mapply() is unnecessary; this also yields character(0)
# instead of list() when there are no tokens.
u <- substr(rep(s, times = nrow(da)),
            start = da$start,
            stop = da$end)

# One row per token: the word ("palavra") and its POS tag ("rotulo").
data.frame(palavra = u,
           rotulo = p,
           stringsAsFactors = FALSE)
## palavra rotulo
## 1 The DT
## 2 book NN
## 3 is VBZ
## 4 on IN
## 5 the DT
## 6 table NN
## 7 . .
## 8 The DT
## 9 sky NN
## 10 is VBZ
## 11 blue JJ
## 12 . .
## 13 My PRP$
## 14 cat NN
## 15 ate VBD
## 16 your PRP$
## 17 fish NN
## 18 . .