-
Notifications
You must be signed in to change notification settings - Fork 0
/
plotPowerLaw-GIRT.r
132 lines (109 loc) · 4.63 KB
/
plotPowerLaw-GIRT.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Working directory of the evaluation tools (machine-specific path).
setwd('D:/evaldata/evaltools/')
# setwd('/Users/schaer/Desktop/evaltools/')

# Root directory of the result data; it holds one sub-directory per entity
# type as well as the CSV file with the fitted power law parameters.
rootDir <- 'C:/Users/sc/Dropbox/Dissertation/results/girt/facets_topicQuery'
#rootDir <- '/Users/schaer/Dropbox/Dissertation/results/girt/Facetten-Analyse'
csvFile <- 'powerlawForPlot.csv'

# Entity types to process (each one is a sub-directory of rootDir).
# entities <- c('subject')
entities <- c(
  'author', 'classification', 'issn', 'location',
  'method', 'publisher', 'pubyear', 'subject'
)

# The draw function used to live in its own file:
# source('drawPowerLaw.r')

# Load the table with all power law exponents. Column 8 is dropped because
# the export leaves an empty trailing column.
powerLawExponents <- read.csv2(
  file.path(rootDir, csvFile),
  header = TRUE,
  sep = ';',
  blank.lines.skip = TRUE
)[, -8]
# Draw the power law plots for a given directory, a year and a data type.
#
# Every file in <currentDir>/<year> whose name ends in 'Boost1.csv' is read
# and its first column is plotted as frequency against rank on a log-log
# scale. The panels are arranged in a 5 x 5 grid per page and written to
# <currentDir>/../pdf.merged/<type><year>.pdf. Afterwards pdftk is invoked to
# write a compressed copy to <currentDir>/../pdf.compressed/<type><year>.pdf.
#
# Args:
#   currentDir: directory of one entity type; contains one folder per year.
#   year:       name of the folder below currentDir to process.
#   type:       entity type; part of the input/output file names and used to
#               select rows of powerLawExponents via its 'run' column.
#
# Reads the global data frame powerLawExponents (columns topic, run, alpha,
# xmin and pval are used here).
#
# Returns:
#   The value returned by system() for the pdftk call.
#
# The function is defined before the driver loop below so that the script
# also runs in a fresh R session.
drawPlot <- function(currentDir, year, type) {
  folder <- paste0(currentDir, '/', year)
  filename <- paste0(currentDir, '/../pdf.merged/', type, year, '.pdf')
  # Built directly instead of substituting 'merged' in the full path, which
  # would hit the wrong component if a parent directory contained 'merged'.
  compressedFilename <- paste0(currentDir, '/../pdf.compressed/',
                               type, year, '.pdf')
  # Part of the file name that follows the topic number.
  topicSuffix <- paste0('-', type, 'rerankTopicQueryFiltersBoost1.csv')

  # init the PDF export; make sure the device is closed even if a plot fails
  pdf(filename, pointsize = 8)
  deviceOpen <- TRUE
  on.exit(if (deviceOpen) dev.off(), add = TRUE)

  # arrange the layout: 25 square panels per page
  par(mfrow = c(5, 5), mar = c(1.5, 2, 1.5, 1.5), lwd = 0.5, pty = 's')

  # 'pattern' is a regular expression, not a glob
  for (file in dir(folder, pattern = 'Boost1\\.csv$')) {
    # extract the topic number from the filename (literal match)
    topic <- sub(topicSuffix, '', file, fixed = TRUE)
    # print status line
    print(paste('plotting', topic, 'for type', type))

    # read in the freq from the single csv files
    temptab <- t(read.csv(paste0(folder, '/', file),
                          sep = ';',
                          header = FALSE,
                          blank.lines.skip = TRUE))
    freqs <- temptab[1, ] # data conversion, just the first row
    ranks <- seq_along(freqs)

    # plot on a log-log scale
    plot(ranks, freqs, xlab = 'rank', ylab = 'frequency', log = 'xy')

    # extract the power law exponent
    # NOTE(review): grep() matches substrings, so a topic id that is
    # contained in another id selects both rows -- confirm the ids in the
    # CSV cannot overlap like that.
    tempValues <- powerLawExponents[grep(topic, powerLawExponents$topic), ]
    resultValues <- tempValues[grep(type, tempValues$run), ]
    if (nrow(resultValues) == 0) {
      warning(paste('no power law exponents found for topic', topic,
                    'and type', type),
              call. = FALSE)
    }
    # when more than one result, take the first one
    # (an all-NA row if nothing matched)
    resultValue <- resultValues[1, ]
    alpha <- resultValue$alpha
    xmin <- resultValue$xmin
    pval <- resultValue$pval

    # draw a dotted line to mark xmin
    abline(v = xmin, lty = 3)

    # check if we really observed a PL
    # See Clauset et al (2009) - section 4.2
    # isTRUE() skips the fit instead of aborting when pval is NA.
    if (isTRUE(pval >= 0.1)) {
      # Earlier attempts that were abandoned: drawing the line directly with
      # abline(a = log10(y_0), b = -log10(alpha)) or with segments() did not
      # work on the log-log plot, and a regression over the full range is
      # only a (wrong) approximation. Hence: regress on the log10 data and
      # only use the data in respect to xmin.
      if (isTRUE(xmin <= length(ranks))) {
        fitRange <- xmin:length(ranks)
        filteredRanks <- log10(ranks[fitRange])
        filteredFreqs <- log10(freqs[fitRange])
        logmodel <- lm(filteredFreqs ~ filteredRanks)
        # a line needs at least one residual degree of freedom
        if (logmodel$df.residual > 0) {
          abline(logmodel, lty = 1, col = 'gray', lwd = 1.5)
        }
      }
    }

    # add some decorating text
    alpha <- format(alpha, digits = 3) # only 3 digits
    text(max(ranks), max(freqs),
         labels = (paste('top:', topic, 'a: -', alpha, 'xmin:', xmin)),
         adj = 1)
  }

  # close file before pdftk reads it
  dev.off()
  deviceOpen <- FALSE

  # compress the pdf file with pdftk; quote the paths so that directories
  # containing spaces survive the command line
  commandLine <- paste('C:/cygwin/bin/pdftk.exe',
                       shQuote(filename),
                       'output',
                       shQuote(compressedFilename),
                       'compress')
  system(commandLine)
}

# Driver: one PDF per entity type and year folder below rootDir.
for (entity in entities) {
  currentDir <- paste0(rootDir, '/', entity)
  for (year in dir(currentDir)) {
    drawPlot(currentDir, year, entity)
  }
}