forked from mhawksey/twitter-hashtag-analytics
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdemo.html
698 lines (514 loc) · 384 KB
/
demo.html
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
<!DOCTYPE html>
<!-- saved from url=(0014)about:internet -->
<html lang="en">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<title>Demo of Using twitter-hashtag-analytics to Analyze Tweets</title>
<style type="text/css">
/* ---- Base typography and page colours ---- */
body,
td {
  font-family: sans-serif;
  background-color: white;
  font-size: 12px;
  margin: 8px;
}

/* Monospace font stack shared by inline code and code listings. */
tt,
code,
pre {
  font-family: 'DejaVu Sans Mono', 'Droid Sans Mono', 'Lucida Console', Consolas, Monaco, monospace;
}

/* ---- Heading scale, relative to the surrounding font size ---- */
h1 {
  font-size: 2.2em;
}

h2 {
  font-size: 1.8em;
}

h3 {
  font-size: 1.4em;
}

h4 {
  font-size: 1.0em;
}

h5 {
  font-size: 0.9em;
}

h6 {
  font-size: 0.8em;
}

/* Visited links are drawn in purple. */
a:visited {
  color: rgb(50%, 0%, 50%);
}

/* ---- Code listings ---- */
/* Bordered box; long lines wrap instead of overflowing. */
pre {
  margin-top: 0;
  max-width: 95%;
  border: 1px solid #ccc;
  white-space: pre-wrap;
}

/* The code element fills its pre box and supplies the inner padding. */
pre code {
  display: block;
  padding: 0.5em;
}

/* Light grey background for listings tagged with class "r" or "cpp". */
code.r,
code.cpp {
  background-color: #F8F8F8;
}

/* ---- Tables, quotes and rules ---- */
table,
td,
th {
  border: none;
}

blockquote {
  color: #666666;
  margin: 0;
  padding-left: 1em;
  border-left: 0.5em #EEE solid;
}

/* Zero-height rule: only the thin dotted top border is visible. */
hr {
  height: 0px;
  border-bottom: none;
  border-top-width: thin;
  border-top-style: dotted;
  border-top-color: #999999;
}

/* ---- Print overrides ---- */
@media print {
  /* Drop backgrounds, colours and filters on every element. */
  * {
    background: transparent !important;
    color: black !important;
    filter: none !important;
    -ms-filter: none !important;
  }

  body {
    font-size: 12pt;
    max-width: 100%;
  }

  a,
  a:visited {
    text-decoration: underline;
  }

  /* A rule is hidden and forces a page break before it. */
  hr {
    visibility: hidden;
    page-break-before: always;
  }

  /* Avoid splitting listings, quotes, table rows and images across pages. */
  pre,
  blockquote {
    padding-right: 1em;
    page-break-inside: avoid;
  }

  tr,
  img {
    page-break-inside: avoid;
  }

  img {
    max-width: 100% !important;
  }

  /* Mirrored margins for left and right pages. */
  @page :left {
    margin: 15mm 20mm 15mm 10mm;
  }

  @page :right {
    margin: 15mm 10mm 15mm 20mm;
  }

  p,
  h2,
  h3 {
    orphans: 3;
    widows: 3;
  }

  /* Avoid a page break directly after a section heading. */
  h2,
  h3 {
    page-break-after: avoid;
  }
}
</style>
<!-- Styles for R syntax highlighter -->
<style type="text/css">
/* Token colours for highlighted listings. Each selector targets a class
   name used by the language definitions in the inline highlighter script. */

/* Operators and brackets: slate grey. */
pre .operator,
pre .paren {
  color: rgb(104, 118, 135);
}

/* Literal constants. */
pre .literal {
  color: rgb(88, 72, 246);
}

/* Numeric constants. */
pre .number {
  color: rgb(0, 0, 205);
}

/* Comments. */
pre .comment {
  color: rgb(76, 136, 107);
}

/* Keywords. */
pre .keyword {
  color: rgb(0, 0, 255);
}

/* Identifiers stay black. */
pre .identifier {
  color: rgb(0, 0, 0);
}

/* String literals. */
pre .string {
  color: rgb(3, 106, 7);
}
</style>
<!-- R syntax highlighter -->
<script type="text/javascript">
/*
 * Minified syntax highlighter exposed as the global `hljs` (it appears to be
 * a highlight.js build; the code is kept minified because it is vendored).
 *
 * Public members assigned on `hljs` below:
 *   LANGUAGES               registry of language definitions (cpp and r here)
 *   highlight(lang, text)   returns {r, keyword_count, value}; `value` is the
 *                           highlighted HTML, or the escaped input if the
 *                           text is illegal for that language
 *   highlightAuto(text)     tries every registered language, keeps the best
 *   fixMarkup(html, tabReplace, useBR)
 *                           replaces leading tabs and/or newlines in markup
 *   highlightBlock(node, tabReplace, useBR)
 *                           highlights one element in place via innerHTML
 *   initHighlighting()      highlights every <pre> whose leading child is <code>
 *   initHighlightingOnLoad()
 *                           registers initHighlighting for DOMContentLoaded/load
 *
 * Fixes relative to the previous revision:
 *   1. The escape helper `m` replaced "&" with "&" and "<" with "<", i.e. it
 *      escaped nothing, so raw "<" and "&" from the listings were written
 *      into innerHTML. It now emits "&amp;" and "&lt;".
 *   2. The R "..." keyword pattern was broken across two physical lines by a
 *      backslash-newline inside the string literal. JavaScript treats that as
 *      a line continuation, which turned the pattern into ".\.\." (any
 *      character followed by two dots). The literal is rejoined so it is
 *      "\.\.\." again.
 *
 * Note: the R definition passes `e:hljs.IMMEDIATE_RE`, which this build never
 * defines. The mode compiler substitutes the default end pattern "\B|\b"
 * whenever `e` is falsy and `eW` is unset, so those modes still terminate.
 */
var hljs=new function(){function m(p){return p.replace(/&/gm,"&amp;").replace(/</gm,"&lt;")}function f(r,q,p){return RegExp(q,"m"+(r.cI?"i":"")+(p?"g":""))}function b(r){for(var p=0;p<r.childNodes.length;p++){var q=r.childNodes[p];if(q.nodeName=="CODE"){return q}if(!(q.nodeType==3&&q.nodeValue.match(/\s+/))){break}}}function h(t,s){var p="";for(var r=0;r<t.childNodes.length;r++){if(t.childNodes[r].nodeType==3){var q=t.childNodes[r].nodeValue;if(s){q=q.replace(/\n/g,"")}p+=q}else{if(t.childNodes[r].nodeName=="BR"){p+="\n"}else{p+=h(t.childNodes[r])}}}if(/MSIE [678]/.test(navigator.userAgent)){p=p.replace(/\r/g,"\n")}return p}function a(s){var r=s.className.split(/\s+/);r=r.concat(s.parentNode.className.split(/\s+/));for(var q=0;q<r.length;q++){var p=r[q].replace(/^language-/,"");if(e[p]){return p}}}function c(q){var p=[];(function(s,t){for(var r=0;r<s.childNodes.length;r++){if(s.childNodes[r].nodeType==3){t+=s.childNodes[r].nodeValue.length}else{if(s.childNodes[r].nodeName=="BR"){t+=1}else{if(s.childNodes[r].nodeType==1){p.push({event:"start",offset:t,node:s.childNodes[r]});t=arguments.callee(s.childNodes[r],t);p.push({event:"stop",offset:t,node:s.childNodes[r]})}}}}return t})(q,0);return p}function k(y,w,x){var q=0;var z="";var s=[];function u(){if(y.length&&w.length){if(y[0].offset!=w[0].offset){return(y[0].offset<w[0].offset)?y:w}else{return w[0].event=="start"?y:w}}else{return y.length?y:w}}function t(D){var A="<"+D.nodeName.toLowerCase();for(var B=0;B<D.attributes.length;B++){var C=D.attributes[B];A+=" "+C.nodeName.toLowerCase();if(C.value!==undefined&&C.value!==false&&C.value!==null){A+='="'+m(C.value)+'"'}}return A+">"}while(y.length||w.length){var v=u().splice(0,1)[0];z+=m(x.substr(q,v.offset-q));q=v.offset;if(v.event=="start"){z+=t(v.node);s.push(v.node)}else{if(v.event=="stop"){var p,r=s.length;do{r--;p=s[r];z+=("</"+p.nodeName.toLowerCase()+">")}while(p!=v.node);s.splice(r,1);while(r<s.length){z+=t(s[r]);r++}}}}return z+m(x.substr(q))}function j(){function q(x,y,v){if(x.compiled){return}var u;var s=[];if(x.k){x.lR=f(y,x.l||hljs.IR,true);for(var w in x.k){if(!x.k.hasOwnProperty(w)){continue}if(x.k[w] instanceof Object){u=x.k[w]}else{u=x.k;w="keyword"}for(var r in u){if(!u.hasOwnProperty(r)){continue}x.k[r]=[w,u[r]];s.push(r)}}}if(!v){if(x.bWK){x.b="\\b("+s.join("|")+")\\s"}x.bR=f(y,x.b?x.b:"\\B|\\b");if(!x.e&&!x.eW){x.e="\\B|\\b"}if(x.e){x.eR=f(y,x.e)}}if(x.i){x.iR=f(y,x.i)}if(x.r===undefined){x.r=1}if(!x.c){x.c=[]}x.compiled=true;for(var t=0;t<x.c.length;t++){if(x.c[t]=="self"){x.c[t]=x}q(x.c[t],y,false)}if(x.starts){q(x.starts,y,false)}}for(var p in e){if(!e.hasOwnProperty(p)){continue}q(e[p].dM,e[p],true)}}function d(B,C){if(!j.called){j();j.called=true}function q(r,M){for(var L=0;L<M.c.length;L++){if((M.c[L].bR.exec(r)||[null])[0]==r){return M.c[L]}}}function v(L,r){if(D[L].e&&D[L].eR.test(r)){return 1}if(D[L].eW){var M=v(L-1,r);return M?M+1:0}return 0}function w(r,L){return L.i&&L.iR.test(r)}function K(N,O){var M=[];for(var L=0;L<N.c.length;L++){M.push(N.c[L].b)}var r=D.length-1;do{if(D[r].e){M.push(D[r].e)}r--}while(D[r+1].eW);if(N.i){M.push(N.i)}return f(O,M.join("|"),true)}function p(M,L){var N=D[D.length-1];if(!N.t){N.t=K(N,E)}N.t.lastIndex=L;var r=N.t.exec(M);return r?[M.substr(L,r.index-L),r[0],false]:[M.substr(L),"",true]}function z(N,r){var L=E.cI?r[0].toLowerCase():r[0];var M=N.k[L];if(M&&M instanceof Array){return M}return false}function F(L,P){L=m(L);if(!P.k){return L}var r="";var O=0;P.lR.lastIndex=0;var M=P.lR.exec(L);while(M){r+=L.substr(O,M.index-O);var N=z(P,M);if(N){x+=N[1];r+='<span class="'+N[0]+'">'+M[0]+"</span>"}else{r+=M[0]}O=P.lR.lastIndex;M=P.lR.exec(L)}return r+L.substr(O,L.length-O)}function J(L,M){if(M.sL&&e[M.sL]){var r=d(M.sL,L);x+=r.keyword_count;return r.value}else{return F(L,M)}}function I(M,r){var L=M.cN?'<span class="'+M.cN+'">':"";if(M.rB){y+=L;M.buffer=""}else{if(M.eB){y+=m(r)+L;M.buffer=""}else{y+=L;M.buffer=r}}D.push(M);A+=M.r}function G(N,M,Q){var R=D[D.length-1];if(Q){y+=J(R.buffer+N,R);return false}var P=q(M,R);if(P){y+=J(R.buffer+N,R);I(P,M);return P.rB}var L=v(D.length-1,M);if(L){var O=R.cN?"</span>":"";if(R.rE){y+=J(R.buffer+N,R)+O}else{if(R.eE){y+=J(R.buffer+N,R)+O+m(M)}else{y+=J(R.buffer+N+M,R)+O}}while(L>1){O=D[D.length-2].cN?"</span>":"";y+=O;L--;D.length--}var r=D[D.length-1];D.length--;D[D.length-1].buffer="";if(r.starts){I(r.starts,"")}return R.rE}if(w(M,R)){throw"Illegal"}}var E=e[B];var D=[E.dM];var A=0;var x=0;var y="";try{var s,u=0;E.dM.buffer="";do{s=p(C,u);var t=G(s[0],s[1],s[2]);u+=s[0].length;if(!t){u+=s[1].length}}while(!s[2]);if(D.length>1){throw"Illegal"}return{r:A,keyword_count:x,value:y}}catch(H){if(H=="Illegal"){return{r:0,keyword_count:0,value:m(C)}}else{throw H}}}function g(t){var p={keyword_count:0,r:0,value:m(t)};var r=p;for(var q in e){if(!e.hasOwnProperty(q)){continue}var s=d(q,t);s.language=q;if(s.keyword_count+s.r>r.keyword_count+r.r){r=s}if(s.keyword_count+s.r>p.keyword_count+p.r){r=p;p=s}}if(r.language){p.second_best=r}return p}function i(r,q,p){if(q){r=r.replace(/^((<[^>]+>|\t)+)/gm,function(t,w,v,u){return w.replace(/\t/g,q)})}if(p){r=r.replace(/\n/g,"<br>")}return r}function n(t,w,r){var x=h(t,r);var v=a(t);var y,s;if(v){y=d(v,x)}else{return}var q=c(t);if(q.length){s=document.createElement("pre");s.innerHTML=y.value;y.value=k(q,c(s),x)}y.value=i(y.value,w,r);var u=t.className;if(!u.match("(\\s|^)(language-)?"+v+"(\\s|$)")){u=u?(u+" "+v):v}if(/MSIE [678]/.test(navigator.userAgent)&&t.tagName=="CODE"&&t.parentNode.tagName=="PRE"){s=t.parentNode;var p=document.createElement("div");p.innerHTML="<pre><code>"+y.value+"</code></pre>";t=p.firstChild.firstChild;p.firstChild.cN=s.cN;s.parentNode.replaceChild(p.firstChild,s)}else{t.innerHTML=y.value}t.className=u;t.result={language:v,kw:y.keyword_count,re:y.r};if(y.second_best){t.second_best={language:y.second_best.language,kw:y.second_best.keyword_count,re:y.second_best.r}}}function o(){if(o.called){return}o.called=true;var r=document.getElementsByTagName("pre");for(var p=0;p<r.length;p++){var q=b(r[p]);if(q){n(q,hljs.tabReplace)}}}function l(){if(window.addEventListener){window.addEventListener("DOMContentLoaded",o,false);window.addEventListener("load",o,false)}else{if(window.attachEvent){window.attachEvent("onload",o)}else{window.onload=o}}}var e={};this.LANGUAGES=e;this.highlight=d;this.highlightAuto=g;this.fixMarkup=i;this.highlightBlock=n;this.initHighlighting=o;this.initHighlightingOnLoad=l;this.IR="[a-zA-Z][a-zA-Z0-9_]*";this.UIR="[a-zA-Z_][a-zA-Z0-9_]*";this.NR="\\b\\d+(\\.\\d+)?";this.CNR="\\b(0[xX][a-fA-F0-9]+|(\\d+(\\.\\d*)?|\\.\\d+)([eE][-+]?\\d+)?)";this.BNR="\\b(0b[01]+)";this.RSR="!|!=|!==|%|%=|&|&&|&=|\\*|\\*=|\\+|\\+=|,|\\.|-|-=|/|/=|:|;|<|<<|<<=|<=|=|==|===|>|>=|>>|>>=|>>>|>>>=|\\?|\\[|\\{|\\(|\\^|\\^=|\\||\\|=|\\|\\||~";this.ER="(?![\\s\\S])";this.BE={b:"\\\\.",r:0};this.ASM={cN:"string",b:"'",e:"'",i:"\\n",c:[this.BE],r:0};this.QSM={cN:"string",b:'"',e:'"',i:"\\n",c:[this.BE],r:0};this.CLCM={cN:"comment",b:"//",e:"$"};this.CBLCLM={cN:"comment",b:"/\\*",e:"\\*/"};this.HCM={cN:"comment",b:"#",e:"$"};this.NM={cN:"number",b:this.NR,r:0};this.CNM={cN:"number",b:this.CNR,r:0};this.BNM={cN:"number",b:this.BNR,r:0};this.inherit=function(r,s){var p={};for(var q in r){p[q]=r[q]}if(s){for(var q in s){p[q]=s[q]}}return p}}();
/* C++ language definition: keyword and built-in tables plus the lexical modes. */
hljs.LANGUAGES.cpp=function(){var a={keyword:{"false":1,"int":1,"float":1,"while":1,"private":1,"char":1,"catch":1,"export":1,virtual:1,operator:2,sizeof:2,dynamic_cast:2,typedef:2,const_cast:2,"const":1,struct:1,"for":1,static_cast:2,union:1,namespace:1,unsigned:1,"long":1,"throw":1,"volatile":2,"static":1,"protected":1,bool:1,template:1,mutable:1,"if":1,"public":1,friend:2,"do":1,"return":1,"goto":1,auto:1,"void":2,"enum":1,"else":1,"break":1,"new":1,extern:1,using:1,"true":1,"class":1,asm:1,"case":1,typeid:1,"short":1,reinterpret_cast:2,"default":1,"double":1,register:1,explicit:1,signed:1,typename:1,"try":1,"this":1,"switch":1,"continue":1,wchar_t:1,inline:1,"delete":1,alignof:1,char16_t:1,char32_t:1,constexpr:1,decltype:1,noexcept:1,nullptr:1,static_assert:1,thread_local:1,restrict:1,_Bool:1,complex:1},built_in:{std:1,string:1,cin:1,cout:1,cerr:1,clog:1,stringstream:1,istringstream:1,ostringstream:1,auto_ptr:1,deque:1,list:1,queue:1,stack:1,vector:1,map:1,set:1,bitset:1,multiset:1,multimap:1,unordered_set:1,unordered_map:1,unordered_multiset:1,unordered_multimap:1,array:1,shared_ptr:1}};return{dM:{k:a,i:"</",c:[hljs.CLCM,hljs.CBLCLM,hljs.QSM,{cN:"string",b:"'\\\\?.",e:"'",i:"."},{cN:"number",b:"\\b(\\d+(\\.\\d*)?|\\.\\d+)(u|U|l|L|ul|UL|f|F)"},hljs.CNM,{cN:"preprocessor",b:"#",e:"$"},{cN:"stl_container",b:"\\b(deque|list|queue|stack|vector|map|set|bitset|multiset|multimap|unordered_map|unordered_set|unordered_multiset|unordered_multimap|array)\\s*<",e:">",k:a,r:10,c:["self"]}]}}}();
/* R language definition: an ordered list of regex modes; the class names match the highlighter stylesheet. */
hljs.LANGUAGES.r={dM:{c:[hljs.HCM,{cN:"number",b:"\\b0[xX][0-9a-fA-F]+[Li]?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+(?:[eE][+\\-]?\\d*)?L\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\b\\d+\\.(?!\\d)(?:i\\b)?",e:hljs.IMMEDIATE_RE,r:1},{cN:"number",b:"\\b\\d+(?:\\.\\d*)?(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"number",b:"\\.\\d+(?:[eE][+\\-]?\\d*)?i?\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"keyword",b:"(?:tryCatch|library|setGeneric|setGroupGeneric)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\.",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\.\\.\\d+(?![\\w.])",e:hljs.IMMEDIATE_RE,r:10},{cN:"keyword",b:"\\b(?:function)",e:hljs.IMMEDIATE_RE,r:2},{cN:"keyword",b:"(?:if|in|break|next|repeat|else|for|return|switch|while|try|stop|warning|require|attach|detach|source|setMethod|setClass)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"literal",b:"(?:NA|NA_integer_|NA_real_|NA_character_|NA_complex_)\\b",e:hljs.IMMEDIATE_RE,r:10},{cN:"literal",b:"(?:NULL|TRUE|FALSE|T|F|Inf|NaN)\\b",e:hljs.IMMEDIATE_RE,r:1},{cN:"identifier",b:"[a-zA-Z.][a-zA-Z0-9._]*\\b",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"<\\-(?!\\s*\\d)",e:hljs.IMMEDIATE_RE,r:2},{cN:"operator",b:"\\->|<\\-",e:hljs.IMMEDIATE_RE,r:1},{cN:"operator",b:"%%|~",e:hljs.IMMEDIATE_RE},{cN:"operator",b:">=|<=|==|!=|\\|\\||&&|=|\\+|\\-|\\*|/|\\^|>|<|!|&|\\||\\$|:",e:hljs.IMMEDIATE_RE,r:0},{cN:"operator",b:"%",e:"%",i:"\\n",r:1},{cN:"identifier",b:"`",e:"`",r:0},{cN:"string",b:'"',e:'"',c:[hljs.BE],r:0},{cN:"string",b:"'",e:"'",c:[hljs.BE],r:0},{cN:"paren",b:"[[({\\])}]",e:hljs.IMMEDIATE_RE,r:0}]}};
/* Highlight every eligible listing once the document has loaded. */
hljs.initHighlightingOnLoad();
</script>
</head>
<body>
<h1>Demo of Using twitter-hashtag-analytics to Analyze Tweets</h1>
<p>Building on <a href="https://github.com/benmarwick/AAA2011-Tweets">Ben Marwick</a>, <a href="http://mashe.hawksey.info/2012/01/tags-r/">Martin Hawksey</a> and <a href="http://blog.ouseful.info/2012/01/21/a-quick-view-over-a-mashe-google-spreadsheet-twitter-archive-of-ukgc2012-tweets/">Tony Hirst</a>'s work on analyzing tweets with R, I started an R project for tweet analysis, namely <a href="https://github.com/dirkchen/twitter-hashtag-analytics">twitter-hashtag-analytics</a>. This project is hosted on GitHub and welcomes contributions from anyone who is interested. It is my very first attempt to write a package in R, so I admit that its capabilities are still limited and its structure may not be properly planned. Any advice will be highly appreciated.</p>
<p>This demo, drafted with <a href="http://yihui.name/knitr/">knitr</a>, aims to show the functionality of <a href="https://github.com/dirkchen/twitter-hashtag-analytics">twitter-hashtag-analytics</a> and is also available on GitHub. It will evolve along with this project.</p>
<h2>Data Preparation</h2>
<p>Before starting to analyze tweets, we will first load a few source files (libraries) in this project.</p>
<pre><code class="r"># check working directory
getwd()
# note that knitr automatically sets wd to where the Rmd file is, so if
# you wish to run code line-by-line, you should setwd manually.
# setwd('/home/bodong/src/r/twitter-analytics/twitter-hashtag-analytics')
# load source files
source("get_tweets.R")
source("munge_tweets.R")
source("utilities.R")
</code></pre>
<p>Then we can retrieve a Twitter hashtag dataset by searching through the Twitter API. Two other methods of retrieving tweets implemented in this project so far include <strong>retrieving from Google Spreadsheet archives</strong> (see <a href="http://mashe.hawksey.info/2013/02/twitter-archive-tagsv5/">here</a>) and <strong>reading directly from a CSV file</strong>.</p>
<pre><code class="r"># get tweets by search
# this function is defined in get_tweets.R
df &lt;- GetTweetsBySearch('#LAK13')
# save or load data (so you can reuse data rather than search all the time)
save(df, file="./data/df.Rda")
# load("./data/df.Rda")
</code></pre>
<p>This dataset contains 101 tweets posted by 49 unique Twitter users between 2013-02-20 and 2013-02-26.</p>
<p>Because tweet information retrieved through twitteR is somewhat limited (see its <a href="http://cran.r-project.org/web/packages/twitteR/index.html">reference manual</a>, p. 11), we need to extract user information, such as <code>reply_to_user</code> and <code>retweet_from_user</code>, manually from each tweet. At the same time, because the names of metadata fields in twitteR are quite different from those used in the official Twitter API, the following <code>PreprocessTweets</code> function in <code>munge_tweets.R</code> also renames some attributes of tweets. Moreover, the <code>PreprocessTweets</code> function trims URLs from tweets and puts them in a new column named <code>links</code>.</p>
<pre><code class="r"># preprocessing
df &lt;- PreprocessTweets(df)
# structure of df
str(df)
</code></pre>
<pre><code>## 'data.frame': 101 obs. of 14 variables:
## $ text : chr "Professor in Learning Analytics - Milton Keynes - OPEN UNIVERSITY #jobs Anyone interested from #LAK13" "RT @sbskmi: #LearningAnalytics tutorials + practicals in #lak13 open course: Tableau, R + Evidence Hub " "Demo of Using twitter-hashtag-analytics Package to Analyze Tweets from #LAK13 " "Introducing Drake, a kind of 'make for data' from @factual: #lak13" ...
## $ favorited : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSN : logi NA NA NA NA NA NA ...
## $ created_at : POSIXct, format: "2013-02-26 20:32:29" "2013-02-26 20:22:12" ...
## $ truncated : logi FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ replyToSID : chr NA NA NA NA ...
## $ id : chr "306502014211354624" "306499425751162880" "306492081793282048" "306487793666891776" ...
## $ replyToUID : logi NA NA NA NA NA NA ...
## $ statusSource: chr "&lt;a href=&quot;http://twitter.com/tweetbutton&quot;&gt;Tweet Button&lt;/a&gt;" "&lt;a href=&quot;http://www.tweetdeck.com&quot;&gt;TweetDeck&lt;/a&gt;" "&lt;a href=&quot;http://publicize.wp.com/&quot;&gt;WordPress.com&lt;/a&gt;" "&lt;a href=&quot;http://www.tweetdeck.com&quot;&gt;TweetDeck&lt;/a&gt;" ...
## $ screen_name : chr "EleniZazani" "sjgknight" "bodongchen" "cteplovs" ...
## $ from_user : chr "EleniZazani" "sjgknight" "bodongchen" "cteplovs" ...
## $ reply_to : chr NA NA NA NA ...
## $ retweet_from: chr NA "sbskmi" NA NA ...
## $ links : chr "http://t.co/GfIVItBc74" "http://t.co/baQ8yNlZqV" "http://t.co/DObfNWwmcs" "http://t.co/H6kJET9w2t" ...
</code></pre>
<h2>Start from Easy Stuff: Count Things</h2>
<h3>Count tweets, retweets (by), and replies (to) for each user</h3>
<p>Regular statuses, retweets, and replies are the three main types of tweets we analyze. The <code>GetTweetCountTable</code> function can easily count the total tweets sent by a user, the number of times a user was retweeted by other users, and the number of replies a user has received.</p>
<pre><code class="r">require(ggplot2)
require(reshape2)
# Count tables
countTweets &lt;- GetTweetCountTable(df, "from_user")
countRetweets &lt;- GetTweetCountTable(df, "retweet_from")
countReplies &lt;- GetTweetCountTable(df, "reply_to")
# quickly check distribution of tweets per user
qplot(countTweets$count, binwidth = 1, xlab = "Number of Tweets")
</code></pre>
<p><img src="" alt="plot of chunk counttables"/> </p>
<pre><code class="r">
# combine counts into one data frame
counts &lt;- merge(countTweets, countRetweets, by = "user", all.x = TRUE)
counts &lt;- merge(counts, countReplies, by = "user", all.x = TRUE)
colnames(counts) &lt;- c("user", "tweets", "replied_to", "retweeted_by")
counts[is.na(counts)] &lt;- 0
# melt data
counts.melt &lt;- melt(counts, id.vars = c("user"))
# plot (Cleveland dot plot)
ggplot(counts.melt, aes(x = user, y = value, color = variable)) + geom_point() +
coord_flip() + ggtitle("Counts of tweets, retweets, and messages") + xlab("Counts") +
ylab("Users")
</code></pre>
<p><img src="" alt="plot of chunk counttables"/> </p>
<h3>Ratio of retweets to tweets</h3>
<p>To get a sense of how well received or valued a user's tweets were within the community, we can further compute the ratio of retweets by other users to tweets sent.</p>
<pre><code class="r"># create new column 'ratio'
counts$ratio &lt;- counts$retweeted_by/counts$tweets
# plot ratio for users who have at least one rt
ggplot(counts[counts$retweeted_by > 0, ], aes(x = reorder(user, ratio), y = ratio)) +
geom_point() + coord_flip() + ggtitle("Ratio of retweets to tweets") + xlab("Users") +
ylab("Retweets/Tweets ratio")
</code></pre>
<p><img src="" alt="plot of chunk ratio"/> </p>
<h3>Count URLs</h3>
<p>URLs embedded in tweets are important because they usually link to important resources that are of interest to this community.</p>
<pre><code class="r"># count links
countLinks &lt;- GetTweetCountTable(df, "links")
names(countLinks)[1] &lt;- "url"
# check top links
head(countLinks[with(countLinks, order(-count)), ])
</code></pre>
<pre><code>## url count
## 1 https://t.co/1hj1FrD8 11
## 2 https://t.co/8Di8QcKz 7
## 3 https://t.co/EgtjcKoU6a 6
## 4 https://t.co/jscpxQpfNA 4
## 5 https://t.co/LnZsOCNFNs 3
## 6 https://t.co/rD8rtO05XV 3
</code></pre>
<pre><code class="r">
# plot to see distribution of links
ggplot(countLinks[countLinks$count > 1, ], aes(reorder(url, count), count)) +
geom_point() + coord_flip() + xlab("URL") + ylab("Number of messages containing the URL")
</code></pre>
<p><img src="" alt="plot of chunk counturls"/> </p>
<h2>Social Network Analysis (SNA)</h2>
<h3>Visualize social networks</h3>
<p>An archived tweet dataset contains <code>retweeting</code> and <code>replying</code> as the two main types of links among users. Some studies look into <code>following</code> relations, which require further queries to Twitter. So in this demo, we focus on <code>retweeting</code> and <code>replying</code> links.</p>
<p>The <code>CreateSNADataFrame</code> function in <code>social_analysis.R</code> provides an easy way to create a data frame containing all edges of the requested social network. With created edges, we can easily create an SNA graph and visualize it with packages like <code>igraph</code> and <code>sna</code>.</p>
<pre><code class="r"># load source file first
source("social_analysis.R")
# create data frame
rt.df &lt;- CreateSNADataFrame(df, from = "from_user", to = "retweet_from", linkNames = "rt")
rp.df &lt;- CreateSNADataFrame(df, from = "from_user", to = "reply_to", linkNames = "rp")
# begin social network analysis plotting
require(igraph)
require(sna)
require(Matrix)
require(SparseM)
# create graph data frame (igraph)
g &lt;- graph.data.frame(rt.df, directed = TRUE)
# plot with igraph (quick and dirty)
plot.igraph(g)
</code></pre>
<p><img src="" alt="plot of chunk sna"/> </p>
<pre><code class="r">
# plot with sna: get adjacency matrix
mat &lt;- get.adjacency(g)
# convert to csr matrix provided by SparseM; ref:
# http://cos.name/cn/topic/108758
mat.csr &lt;- as.matrix.csr(mat, ncol = ncol(mat))
# plot with sna
gplot(mat.csr)
</code></pre>
<p><img src="" alt="plot of chunk sna"/> </p>
<h3>Basic SNA measures</h3>
<p>We can further compute some basic SNA measures. For instance, density of this network is 0.0312, reciprocity of users in the network is 0.9412, and degree centralization of this network is 0.2405. These measures are calculated as below.</p>
<pre><code class="r"># density
gden(mat.csr)
</code></pre>
<pre><code>## [1] 0.03119
</code></pre>
<pre><code class="r">
# reciprocity
grecip(mat.csr)
</code></pre>
<pre><code>## Mut
## 0.9412
</code></pre>
<pre><code class="r">
# centralization
centralization(mat.csr, sna::degree)
</code></pre>
<pre><code>## [1] 0.2405
</code></pre>
<h3>Community detection</h3>
<p>A regular task in SNA is to identify communities in a network. We can do it through the <code>walktrap.community</code> function in <code>igraph</code> package.</p>
<pre><code class="r">g.wc &lt;- walktrap.community(g, steps = 1000, modularity = TRUE)
# number of communities
length(g.wc)
</code></pre>
<pre><code>## [1] 4
</code></pre>
<pre><code class="r"># sizes of communities
sizes(g.wc)
</code></pre>
<pre><code>## Community sizes
## 1 2 3 4
## 5 25 2 2
</code></pre>
<pre><code class="r"># plot
plot(as.dendrogram(g.wc))
</code></pre>
<p><img src="" alt="plot of chunk detectcommunity"/> </p>
<p>We have detected 4 communities in this network. The largest community contains 51.02% of all users in this dataset.</p>
<h3>Univariate Conditional Uniform Graph Tests</h3>
<p>In network analysis, people run various tests to check whether some aspects of a network are <em>unusual</em>. We can do such tests, namely <em>conditional uniform graph tests</em>, through the <code>cug.test</code> function in the <code>sna</code> package. Further information about these tests can be found <a href="http://artax.karlin.mff.cuni.cz/r-help/library/sna/html/cug.test.html">here</a>.</p>
<pre><code class="r"># density
cug.gden &lt;- cug.test(mat.csr, gden)
plot(cug.gden)
</code></pre>
<p><img src="" alt="plot of chunk cug"/> </p>
<pre><code class="r">range(cug.gden$rep.stat)
</code></pre>
<pre><code>## [1] 0.4510 0.5419
</code></pre>
<pre><code class="r">
# reciprocity
cug.recip &lt;- cug.test(mat.csr, grecip)
plot(cug.recip)
</code></pre>
<p><img src="" alt="plot of chunk cug"/> </p>
<pre><code class="r">range(cug.recip$rep.stat)
</code></pre>
<pre><code>## [1] 0.4332 0.5615
</code></pre>
<pre><code class="r">
# transitivity
cug.gtrans &lt;- cug.test(mat.csr, gtrans)
plot(cug.gtrans)
</code></pre>
<p><img src="" alt="plot of chunk cug"/> </p>
<pre><code class="r">range(cug.gtrans$rep.stat)
</code></pre>
<pre><code>## [1] 0.4485 0.5477
</code></pre>
<pre><code class="r">
# centralisation
cug.cent &lt;- cug.test(mat.csr, centralization, FUN.arg = list(FUN = degree))
plot(cug.cent)
</code></pre>
<p><img src="" alt="plot of chunk cug"/> </p>
<pre><code class="r">range(cug.cent$rep.stat)
</code></pre>
<pre><code>## [1] 0.0625 0.2585
</code></pre>
<h2>Semantic Analysis</h2>
<h3>Words</h3>
<p>Firstly, make a word cloud.</p>
<pre><code class="r"># load source file first
source("semantic_analysis.R")
# construct corpus, with regular preprocessing performed
corpus &lt;- ConstructCorpus(df$text, removeTags = TRUE, removeUsers = TRUE)
# make a word cloud
MakeWordCloud(corpus)
</code></pre>
<p><img src="" alt="plot of chunk wordcloud"/> </p>
<p>This task first uses <code>ConstructCorpus</code> in <code>semantic_analysis.R</code> to create a text corpus, and then uses <code>MakeWordCloud</code> to make a word cloud. Please note that <code>ConstructCorpus</code> provides a number of options such as whether to remove hashtags (#tag) or users (@user) embedded in tweets.</p>
<p>Next we are going to create a term-document matrix for some quick similarity computation.</p>
<pre><code class="r"># create a term-document matrix; only keep tokens longer than three
# characters
td.mat &lt;- TermDocumentMatrix(corpus, control = list(minWordLength = 3))
# have a quick look
td.mat
</code></pre>
<pre><code>## A term-document matrix (272 terms, 101 documents)
##
## Non-/sparse entries: 704/26768
## Sparsity : 97%
## Maximal term length: 23
## Weighting : term frequency (tf)
</code></pre>
<pre><code class="r">
# frequent words
findFreqTerms(td.mat, lowfreq = 10)
</code></pre>
<pre><code>## [1] "activity" "analytics" "analyzing" "canvas" "capturing"
## [6] "course" "data" "discussion" "evidence" "feedback"
## [11] "fritz" "john" "join" "learning" "min"
## [16] "network" "peer" "recipes" "scale" "tools"
## [21] "using"
</code></pre>
<pre><code class="r">
# find related words of a word
findAssocs(td.mat, "learning", 0.5)
</code></pre>
<pre><code>## min analytics feedback fritz peer scale join
## 0.78 0.73 0.64 0.64 0.64 0.64 0.60
## using
## 0.54
</code></pre>
<p>For more advanced similarity computation among documents and terms, I am considering adding Latent Semantic Analysis (LSA) capability into this package in the future.</p>
<h3>Topic modelling with Latent Dirichlet Allocation (LDA)</h3>
<p>With the sparse term-document matrix created above, we can use the <code>TrainLDAModel</code> function in <code>semantic_analysis.R</code> to train an LDA model. (Note: I don't understand all of the steps in the code in <code>TrainLDAModel</code>, which was refactored from <a href="https://github.com/benmarwick/AAA2011-Tweets">Ben Marwick's repo</a>, so please help check it if you understand LDA.) This step may take a while depending on the size of the dataset.</p>
<pre><code class="r"># timing start
ptm &lt;- proc.time()
# generate an LDA model
lda &lt;- TrainLDAModel(td.mat)
# time used
proc.time() - ptm
</code></pre>
<pre><code>## user system elapsed
## 122.000 0.012 122.561
</code></pre>
<p>This LDA model contains 24 topics. We can check keywords in each topic, get relevant topics of each tweet, and compute similarity scores among tweets based on topics they are related to.</p>
<pre><code class="r"># get keywords for each topic
lda_terms &lt;- get_terms(lda, 5)
# look at the first 5 topics
lda_terms[, 1:5]
</code></pre>
<pre><code>## Topic 1 Topic 2 Topic 3 Topic 4 Topic 5
## [1,] "contribution" "session" "john" "assignment" "data"
## [2,] "dont" "data" "research" "data" "analysis"
## [3,] "evidence" "education" "session" "expensive" "pandas"
## [4,] "maybe" "matters" "available" "human" "using"
## [5,] "meaningful" "owns" "python" "mining" "available"
</code></pre>
<pre><code class="r">
# gets topic numbers per document
lda_topics &lt;- get_topics(lda, 5)
# look at the first 10 documents
lda_topics[, 1:10]
</code></pre>
<pre><code>## 1 2 3 4 5 6 7 8 9 10
## [1,] 20 16 14 11 5 2 16 16 16 8
## [2,] 24 1 5 17 1 10 1 1 2 2
## [3,] 19 2 1 22 3 1 2 2 3 5
## [4,] 1 3 2 1 6 3 3 3 5 4
## [5,] 2 5 3 2 7 4 5 5 6 1
</code></pre>
<pre><code class="r">
# compute similarity between two documents
CosineSimilarity(lda_topics[, 1], lda_topics[, 10])
</code></pre>
<pre><code>## [,1]
## [1,] 0.8042
</code></pre>
<pre><code class="r">
# compute a similarity matrix of documents
sim.mat &lt;- sapply(1:ncol(lda_topics), function(i) {
sapply(1:ncol(lda_topics), function(j) CosineSimilarity(lda_topics[, i],
lda_topics[, j]))
})
# find most relevant tweets for a tweet
index &lt;- 1
ids &lt;- which(sim.mat[, index] > quantile(sim.mat[, index], 0.9))
sim.doc.df &lt;- data.frame(id = ids, sim = sim.mat[, index][ids])
sim.doc.df &lt;- sim.doc.df[with(sim.doc.df, order(-sim)), ]
# indices of most relevant tweets
head(sim.doc.df$id)
</code></pre>
<pre><code>## [1] 1 63 73 4 50 58
</code></pre>
<h3>Sentiment Analysis</h3>
<p>This project implements three methods of analyzing the sentiment of tweets (one of which, depending on <em>ViralHeat</em>, is currently not working). Let's try the function <code>ScoreSentiment</code> in <code>sentiment_analysis.R</code>, which is implemented based on <a href="http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/">this post</a>.</p>
<pre><code class="r"># compute sentiment scores for all tweets
scores <- ScoreSentiment(df$text, .progress = "text")
# plot scores
ggplot(scores, aes(x = score)) + geom_histogram(binwidth = 1) + xlab("Sentiment score") +
ylab("Frequency") + ggtitle("Sentiment Analysis of Tweets")
</code></pre>
<p><img src="" alt="plot of chunk sentiment"/> </p>
<pre><code class="r">
scores &lt;- scores[with(scores, order(-score)), ]
# check happy tweets
as.character(head(scores$text, 3))
# check unhappy tweets
as.character(tail(scores$text, 3))
# check sentiment scores of tweets containing certain words: create subset
# based on tweets with certain words, e.g., learning
scores.sub &lt;- subset(scores, regexpr("learning", scores$text) > 0)
# plot histogram for this token
ggplot(scores.sub, aes(x = score)) + geom_histogram(binwidth = 1) + xlab("Sentiment score for the token 'learning'") +
ylab("Frequency")
</code></pre>
<p><img src="" alt="plot of chunk sentiment"/> </p>
<p>Sentiment analysis with the <code>sentiment</code> package.</p>
<pre><code class="r">scores2 &lt;- ScoreSentiment2(df$text)
# plot scores. scale_x_log10 is used because the score is based on log
# likelihood
ggplot(scores2, aes(x = score)) + geom_histogram() + xlab("Sentiment score") +
ylab("Frequency") + ggtitle("Sentiment Analysis of Tweets") + scale_x_log10()
</code></pre>
<p><img src="" alt="plot of chunk sentiment2"/> </p>
<pre><code class="r">
# plot emotion
qplot(scores2$emotion)
</code></pre>
<p><img src="" alt="plot of chunk sentiment2"/> </p>
<pre><code class="r">
# plot most likely sentiment category
qplot(scores2$best_fit)
</code></pre>
<p><img src="" alt="plot of chunk sentiment2"/> </p>
<p>We can further check whether these two scores are correlated.</p>
<pre><code class="r"># put them into one data frame
scores3 &lt;- data.frame(score1 = scores$score, score2 = scores2$score)
# scatterplot with regression line
ggplot(scores3, aes(x = score1, y = score2)) + geom_point() + stat_smooth(method = "lm") +
xlab("Score by counting words") + ylab("Score from sentiment package")
</code></pre>
<p><img src="" alt="plot of chunk sentimentcompare"/> </p>
<p>Finally, this project is at an early stage. If you are interested, please fork <a href="https://github.com/dirkchen/twitter-hashtag-analytics">twitter-hashtag-analytics</a> on GitHub.</p>
</body>
</html>