-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathAula5_exemploe_simples_LDA.R
207 lines (160 loc) · 9.49 KB
/
Aula5_exemploe_simples_LDA.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
###################################
#### Manoel Galdino 19/07.2014 ####
#### Aula 5 - LDA/Parte 1 ####
###################################
##########################################
## Começando com um exemplo bem simples ##
##########################################
# Latent Dirichlet Allocation
## exemplo do próprio R
# pacotes
library(XML)
if(require(MCMCpack) == F){install.packages("MCMCpack"); library(MCMCpack)}
if(require(tm) == F){install.packages('tm'); library(tm)}
if(require(SnowballC) == F){install.packages('SnowballC'); library(SnowballC)}
if(require(data.table) == F){install.packages('data.table'); library(data.table)}
if(require(ggplot2) == F){install.packages('ggplot2'); library(ggplot2)}
if(require(seqinr) == F) {install.packages("seqinr"); library(seqinr)}
if(require(RTextTools) == F) {install.packages("RTextTools"); library(RTextTools)}
if(require(topicmodels) == F) {install.packages("topicmodels"); library(topicmodels)}
data(AssociatedPress)
AssociatedPress
dim(AssociatedPress)
# divide o banco entre dados de treinamento
# e dados de teste ou validação
train <- AssociatedPress[1:100]
test <- AssociatedPress[101:150]
save(train, file= "trainAP.RData")
setwd("D:/2014/aulas/IESP/scripts/Curso-IESP-R-aula")
load("trainAP.RData")
dim(train)
# estima o modelo
train.lda <- LDA(train,5)
# extrai os tópicos
(train.topics <- topics(train.lda))
# calcula prob para novos documentos, com base no modelo
# cross-validação...
test.topics <- posterior(train.lda,test)
# escolhe o tópico com maior prob. como tópico dos novos docs.
(test.topics1 <- apply(test.topics$topics, 1, which.max))
# resultado com docs nas linhas
# topicos nas colunas
# prob nas células
test.topics[[2]]
#################################
## Ex. 2, mais completo ##
## Carregando manchetes do NYT ##
#################################
data(NYTimes)
dim(NYTimes)
nyt <- NYTimes
extendedstopwords <- c("a","about","above","across","after",
"again","against","all","almost",
"alone","along","already","also",
"although","always","am","among",
"an","and","another","any","anybody","anyone","anything","anywhere","are","area","areas","aren't","around","as","ask","asked","asking","asks","at","away","b","back","backed","backing","backs","be","became","because","become","becomes","been","before","began","behind","being","beings","below","best","better","between","big","both","but","by","c","came","can","cannot","can't","case","cases","certain","certainly","clear","clearly","come","could","couldn't","d","did","didn't","differ","different","differently","do","does","doesn't","doing","done","don't","down","downed","downing","downs","during","e","each","early","either","end","ended","ending","ends","enough","even","evenly","ever","every","everybody","everyone","everything","everywhere","f","face","faces","fact","facts","far","felt","few","find","finds","first","for","four","from","full","fully","further","furthered","furthering","furthers","g","gave","general","generally","get","gets","give","given","gives","go","going","good","goods","got","great","greater","greatest","group","grouped","grouping","groups","h","had","hadn't","has","hasn't","have","haven't","having","he","he'd","he'll","her","here","here's","hers","herself","he's","high","higher","highest","him","himself","his","how","however","how's","i","i'd","if","i'll","i'm","important","in","interest","interested","interesting","interests","into","is","isn't","it","its","it's","itself","i've","j","just","k","keep","keeps","kind","knew","know","known","knows","l","large","largely","last","later","latest","least","less","let","lets","let's","like","likely","long","longer","longest","m","made","make","making","man","many","may","me","member","members","men","might","more","most","mostly","mr","mrs","much","must","mustn't","my","myself","n","necessary","need","needed","needing","needs","never","new","newer","newest","next","no","nobody","non","noone","nor","not","nothing","now","nowhere","number","numbers","o","of","off","often","old","older","oldest","on","once","one","only","open","opened","opening","opens","or","order","ordered","ordering","orders","other","others","ought","our","ours","ourselves","out","over","own","p","part","parted","parting","parts","per","perhaps","place","places","point","pointed","pointing","points","possible","present","presented","presenting","presents","problem","problems","put","puts","q","quite","r","rather","really","right","room","rooms","s","said","same","saw","say","says","second","seconds","see","seem","seemed","seeming","seems","sees","several","shall","shan't","she","she'd","she'll","she's","should","shouldn't","show","showed","showing","shows","side","sides","since","small","smaller","smallest","so","some","somebody","someone","something","somewhere","state","states","still","such","sure","t","take","taken","than","that","that's","the","their","theirs","them","themselves","then","there","therefore","there's","these","they","they'd","they'll","they're","they've","thing","things","think","thinks","this","those","though","thought","thoughts","three","through","thus","to","today","together","too","took","toward","turn","turned","turning","turns","two","u","under","until","up","upon","us","use","used","uses","v","very","w","want","wanted","wanting","wants","was","wasn't","way","ways","we","we'd","well","we'll","wells","went","were","we're","weren't","we've","what","what's","when","when's","where","where's","whether","which","while","who","whole","whom","who's","whose","why","why's","will","with","within","without","won't","work","worked","working","works","would","wouldn't","x","y","year","years","yes","yet","you","you'd","you'll","young","younger","youngest","your","you're","yours","yourself","yourselves","you've","z")
nyt1 <- nyt[,3]
length(nyt1)
dtm.control <- list(
tolower = T,
removePunctuation = T,
removeNumbers = T,
stopwords = c(stopwords("english"),extendedstopwords),
# snowball tem stopwords em pt
stemming = T,
wordLengths = c(3,Inf),
weighting = weightTf
)
# do pacote tm
nyt1 <- Corpus(VectorSource(nyt1), readerControl = dtm.control)
class(nyt1)
inspect(nyt1)
# # Limpando no R 3.0
# nyt1 <- tm_map(nyt1, stemDocument, language="english")
# nyt1 <- tm_map(nyt1, tolower) # convertendo para caixa baixa
# nyt1 <- tm_map(nyt1, removeWords, extendedstopwords) # removendo palavras escolhidas
# nyt1 <- tm_map(nyt1, removeNumbers) # removendo numeros
# nyt1 <- tm_map(nyt1, removePunctuation) # removendo pontuacao
# nyt1 <- tm_map(nyt1, removeWords, stopwords("english")) # removendo stop words (snowball)
#
# # Limpando no R 3.1.1
# #nyt1 <- tm_map(nyt1, content_transformer(stemDocument, language="english"))
# nyt1 <- tm_map(nyt1, content_transformer(tolower)) # convertendo para caixa baixa
# #nyt1 <- tm_map(nyt1, content_transformer(removeWords, extendedstopwords)) # removendo palavras escolhidas
# nyt1 <- tm_map(nyt1, content_transformer(removeNumbers)) # removendo numeros
# nyt1 <- tm_map(nyt1, content_transformer(removePunctuation)) # removendo pontuacao
# #nyt1 <- tm_map(nyt1, content_transformer(removeWords(stopwords("english")))) # removendo stop words (snowball)
# criando uma DocumentTermMatrix
# é usada como argumento de LDA
dtm <- DocumentTermMatrix(nyt1)
dim(dtm)
## cada doc é uma linha
# cada palavra uma coluna
# número de colunas e igual número de palavras únicas em todos os docs
View(dtm) ## erro. Tem que usar inspect
inspect(dtm[1:10,1:20])
x <- rowSums(as.matrix(dtm))
index <- which(x==0)
dtm1 <- dtm[-index, ]
dim(dtm1)
## Precisamos determinar o númerode tópicos nos dados
## como número de cluster em análise de cluster
## nesse exemplo, já temos
## de uma outra análise
#### Topic Modeling - LDA ####
#set.seed(51)
#trainpoints <- sample(1:nrow(dtm1),1*nrow(dtm1),replace=F) # to train on a subsample, change 1.0 to a lower value, say 0.8
k <- length(unique(nyt$Topic.Code))
## função pra extrair termos
SpecificTerms <- function(lda.model,n=1) {
p <- posterior(lda.model)$terms
n <- min(n,ncol(p))
cumulativep <- colSums(p)
specificp <- t(apply(p,1,function(x) ((x) + (x/cumulativep) )))
topterms <- t(apply(specificp, 1, function(x)
(colnames(specificp)[order(x,decreasing=T)[1:n]]) ))
topterms
}
set.seed(4)
lda <- LDA(dtm1, k)
# t termos mais prováveis por tópico
t <- 15
View(terms(lda, t))
## vamos olhar o Topic 17
## Aprov Eleitoral? Bush, Clinton etc.
# t termos com prob acima de minimo
minimo <- .05
terms(lda, t, threshold=minimo)
# tópicos mais prováveis por documentos
head(topics(lda))
## Agora vamos fazer algumas análises dos resultados
# Here I construct a dataframe that scores each document according to
# how closely its content
# matches up with each topic.
# The closer the score is to 0, the more likely its content matches
# up with a particular topic.
## 4 new york e 5 white house
nytTopics <- posterior(lda)$topics
dfnytTopics <- as.data.frame(nytTopics)
dim(dfnytTopics)
head(dfnytTopics)
names(dfnytTopics) <- paste("topic", names(dfnytTopics), sep="")
dfnytTopics$docs <- row.names(dfnytTopics)
head(dfnytTopics)
# qtos docs topico 17 é o maior??
library("compiler")
which.rowMax <- cmpfun (
function(df,n) {
tmp <- numeric(length=n)
for ( i in 1:n) tmp[i] <- which.max(df[i,-ncol(df)])
return(tmp)
}
)
n <- nrow(dfnytTopics)
tmp <- which.rowMax(dfnytTopics[, -n], n )
topic4 <- dfnytTopics[which(tmp==4),]
dfTemp <- as.character(nyt1)
docEscolhido <- dfTemp[which(tmp==4)]
set.seed(2)
docEscolhido[sample(1:length(docEscolhido), 15) ]
terms(lda, t)[,1]