Rotulagem das partes do discurso

Detector de sentenças

# Attach the NLP package and list every object it exports.
library(NLP)
ls("package:NLP")

##  [1] "annotate"                    "AnnotatedPlainTextDocument" 
##  [3] "annotation"                  "Annotation"                 
##  [5] "annotations_in_spans"        "Annotator"                  
##  [7] "Annotator_Pipeline"          "as.Annotation"              
##  [9] "as.Annotator_Pipeline"       "as.Span"                    
## [11] "as.Span_Tokenizer"           "as.String"                  
## [13] "as.Tagged_Token"             "as.Token_Tokenizer"         
## [15] "blankline_tokenizer"         "Brown_POS_tags"             
## [17] "chunked_sents"               "CoNLLTextDocument"          
## [19] "CoNLLUTextDocument"          "content"                    
## [21] "content<-"                   "features"                   
## [23] "is.Annotation"               "is.Span"                    
## [25] "is.Span_Tokenizer"           "is.String"                  
## [27] "is.Tagged_Token"             "is.Token_Tokenizer"         
## [29] "meta"                        "meta<-"                     
## [31] "next_id"                     "ngrams"                     
## [33] "paras"                       "parsed_paras"               
## [35] "parsed_sents"                "parse_IETF_language_tag"    
## [37] "parse_ISO_8601_datetime"     "Penn_Treebank_POS_tags"     
## [39] "Regexp_Tokenizer"            "sents"                      
## [41] "Simple_Chunk_Annotator"      "Simple_Entity_Annotator"    
## [43] "Simple_Para_Token_Annotator" "Simple_POS_Tag_Annotator"   
## [45] "Simple_Sent_Token_Annotator" "Simple_Stem_Annotator"      
## [47] "Simple_Word_Token_Annotator" "single_feature"             
## [49] "Span"                        "Span_Tokenizer"             
## [51] "String"                      "tagged_paras"               
## [53] "tagged_sents"                "TaggedTextDocument"         
## [55] "Tagged_Token"                "tagged_words"               
## [57] "Token_Tokenizer"             "Tree"                       
## [59] "Tree_apply"                  "Tree_parse"                 
## [61] "Universal_POS_tags"          "Universal_POS_tags_map"     
## [63] "whitespace_tokenizer"        "WordListDocument"           
## [65] "wordpunct_tokenizer"         "words"

# Attach openNLP and list its exports (the Maxent_* / Parse annotator
# constructors used in the rest of this script).
library(openNLP)
ls("package:openNLP")

## [1] "Maxent_Chunk_Annotator"      "Maxent_Entity_Annotator"    
## [3] "Maxent_POS_Tag_Annotator"    "Maxent_Sent_Token_Annotator"
## [5] "Maxent_Word_Token_Annotator" "Parse_Annotator"

# Attach openNLPdata.
# NOTE(review): presumably this supplies the English models used by the
# annotators below -- confirm against the package documentation.
library(openNLPdata)
# ls("package:openNLPdata")

# Requesting a Portuguese sentence annotator fails unless the separate
# 'openNLPmodels.pt' package is installed. The call and the error it
# produced are kept below for reference.
# Maxent_Sent_Token_Annotator(language = "pt")
# Error in Maxent_Simple_Sent_Tokenizer(language, probs, model) :
#   Could not find model file for language 'pt'.
# Please make sure package 'openNLPmodels.pt' is installed,
# available from <http://datacube.wu.ac.at/>.

# Sample text: three short English sentences.
s <- "The book is on the table. The sky is blue. My cat ate your fish."

# English sentence detector; probs = TRUE makes each detected sentence
# carry the model's confidence as a `prob` feature.
en_sen <- Maxent_Sent_Token_Annotator(probs = TRUE, language = "en")

# Run the detector over `s` and display the sentence annotations.
sen <- annotate(s, en_sen)
sen

##  id type     start end features
##   1 sentence     1  25 prob=0.9990691
##   2 sentence    27  42 prob=0.9880874
##   3 sentence    44  64 prob=0.986041

# Cut the text of each detected sentence out of `s` using the start/end
# character offsets stored in the annotations.
local({
  spans <- as.data.frame(sen)
  # substr() recycles start/stop but not x, so `s` is replicated once per
  # sentence before slicing.
  substr(rep(s, nrow(spans)), spans$start, spans$end)
})

## [1] "The book is on the table." "The sky is blue."         
## [3] "My cat ate your fish."

id: o identificador da sentença encontrada.
start: posição do caractere em que a sentença começa.
end: posição do caractere em que a sentença termina.
features: probabilidade de acerto na sentença detectada.

Rotulador das partes do discurso

# English word tokenizer; probs = TRUE attaches a confidence to each token.
en_word <- Maxent_Word_Token_Annotator(probs = TRUE, language = "en")

# Tokenize on top of the sentence annotations: the result holds the
# sentences plus one `word` annotation per token.
word <- annotate(s, en_word, sen)
word

##  id type     start end features
##   1 sentence     1  25 prob=0.9990691,
##                        constituents=<<integer,7>>
##   2 sentence    27  42 prob=0.9880874,
##                        constituents=<<integer,5>>
##   3 sentence    44  64 prob=0.986041,
##                        constituents=<<integer,6>>
##   4 word         1   3 prob=1
##   5 word         5   8 prob=1
##   6 word        10  11 prob=1
##   7 word        13  14 prob=1
##   8 word        16  18 prob=1
##   9 word        20  24 prob=0.9942408
##  10 word        25  25 prob=1
##  11 word        27  29 prob=1
##  12 word        31  33 prob=1
##  13 word        35  36 prob=1
##  14 word        38  41 prob=0.9955833
##  15 word        42  42 prob=1
##  16 word        44  45 prob=1
##  17 word        47  49 prob=1
##  18 word        51  53 prob=1
##  19 word        55  58 prob=1
##  20 word        60  63 prob=0.9771182
##  21 word        64  64 prob=1

# English part-of-speech tagger; probs = TRUE adds a POS_prob feature
# next to each POS tag.
en_pos <- Maxent_POS_Tag_Annotator(probs = TRUE, language = "en")

# Tag the tokens produced by the previous step and display the result.
pos <- annotate(s, en_pos, word)
pos

##  id type     start end features
##   1 sentence     1  25 prob=0.9990691,
##                        constituents=<<integer,7>>
##   2 sentence    27  42 prob=0.9880874,
##                        constituents=<<integer,5>>
##   3 sentence    44  64 prob=0.986041,
##                        constituents=<<integer,6>>
##   4 word         1   3 prob=1, POS=DT, POS_prob=0.9895104
##   5 word         5   8 prob=1, POS=NN, POS_prob=0.9960107
##   6 word        10  11 prob=1, POS=VBZ, POS_prob=0.9951052
##   7 word        13  14 prob=1, POS=IN, POS_prob=0.9910109
##   8 word        16  18 prob=1, POS=DT, POS_prob=0.9952673
##   9 word        20  24 prob=0.9942408, POS=NN,
##                        POS_prob=0.9778091
##  10 word        25  25 prob=1, POS=., POS_prob=0.9912864
##  11 word        27  29 prob=1, POS=DT, POS_prob=0.9897729
##  12 word        31  33 prob=1, POS=NN, POS_prob=0.9934866
##  13 word        35  36 prob=1, POS=VBZ, POS_prob=0.9834103
##  14 word        38  41 prob=0.9955833, POS=JJ,
##                        POS_prob=0.8635544
##  15 word        42  42 prob=1, POS=., POS_prob=0.9895226
##  16 word        44  45 prob=1, POS=PRP$, POS_prob=0.929196
##  17 word        47  49 prob=1, POS=NN, POS_prob=0.9711728
##  18 word        51  53 prob=1, POS=VBD, POS_prob=0.1828995
##  19 word        55  58 prob=1, POS=PRP$, POS_prob=0.9298585
##  20 word        60  63 prob=0.9771182, POS=NN,
##                        POS_prob=0.969467
##  21 word        64  64 prob=1, POS=., POS_prob=0.9865147

# Turn the annotation object into a plain list (one element per annotation)
# and inspect the structure of the fifth one, the word "book".
L <- as.list(pos)
str(L[[5L]])

## Classes 'Annotation', 'Span'  hidden list of 5
##  $ id      : int 5
##  $ type    : chr "word"
##  $ start   : int 5
##  $ end     : int 8
##  $ features:List of 1
##   ..$ :List of 3
##   .. ..$ prob    : num 1
##   .. ..$ POS     : chr "NN"
##   .. ..$ POS_prob: num 0.996
##  - attr(*, "meta")=List of 2
##   ..$ POS_tagset    : chr "en-ptb"
##   ..$ POS_tagset_URL: chr "http://www.comp.leeds.ac.uk/ccalas/tagsets/upenn.html"

# Data-frame view of the annotations, restricted to word tokens
# (the sentence rows are dropped).
da <- subset(as.data.frame(pos), type == "word")
da

##    id type start end                                 features
## 4   4 word     1   3                 1, DT, 0.989510379435813
## 5   5 word     5   8                 1, NN, 0.996010731782666
## 6   6 word    10  11                1, VBZ, 0.995105150195623
## 7   7 word    13  14                 1, IN, 0.991010865026466
## 8   8 word    16  18                 1, DT, 0.995267288891455
## 9   9 word    20  24 0.994240782176942, NN, 0.977809130528209
## 10 10 word    25  25                  1, ., 0.991286374901935
## 11 11 word    27  29                 1, DT, 0.989772882130157
## 12 12 word    31  33                  1, NN, 0.99348656360271
## 13 13 word    35  36                1, VBZ, 0.983410250476636
## 14 14 word    38  41 0.995583348038642, JJ, 0.863554431854847
## 15 15 word    42  42                  1, ., 0.989522585087634
## 16 16 word    44  45               1, PRP$, 0.929195992279972
## 17 17 word    47  49                 1, NN, 0.971172754645763
## 18 18 word    51  53                1, VBD, 0.182899549187972
## 19 19 word    55  58               1, PRP$, 0.929858519457671
## 20 20 word    60  63 0.977118228700449, NN, 0.969466962349041
## 21 21 word    64  64                  1, ., 0.986514678030217

# Extract only the POS tags and the words they belong to.
# vapply() pins the result to exactly one character value per token, so a
# token without a POS feature raises an error instead of silently turning
# the result into a list (as sapply() would).
p <- vapply(da$features, "[[", FUN.VALUE = character(1), "POS")
# Slice every token out of `s` in one vectorised call. substr() recycles
# start/stop but not x, hence the explicit replication of `s`; this also
# yields character(0), rather than list(), when `da` has no rows.
u <- substr(rep(s, nrow(da)), start = da$start, stop = da$end)
data.frame(palavra = u,
           rotulo = p,
           stringsAsFactors = FALSE)

##    palavra rotulo
## 1      The     DT
## 2     book     NN
## 3       is    VBZ
## 4       on     IN
## 5      the     DT
## 6    table     NN
## 7        .      .
## 8      The     DT
## 9      sky     NN
## 10      is    VBZ
## 11    blue     JJ
## 12       .      .
## 13      My   PRP$
## 14     cat     NN
## 15     ate    VBD
## 16    your   PRP$
## 17    fish     NN
## 18       .      .

Dicionário dos rótulos: http://www.comp.leeds.ac.uk/amalgam/tagsets/upenn.html.

Rotulagem das partes do discurso

Justificativa e objetivos

Rotulagem

Como é feita

Detector de sentenças

Rotulador das partes do discurso

Próxima aula

Referências