-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathZoteroRnalysis.R
1655 lines (1048 loc) · 53.8 KB
/
ZoteroRnalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#---
#ZoteroRnalysis, version 1.13
#---
## TODOLIST : commenting the code and changing the name of the variables for better readability
### METADATA
# Author : Pascal Martinolli
# Date (version 1.0) : 2023-12-01
# Last version of the code available at https://github.com/pmartinolli/ZoteroRnalysis/
# GPL-3.0 license https://github.com/pmartinolli/ZoteroRnalysis/blob/main/LICENSE
# This project and an example about TTRPGs is blogged and discussed at https://jdr.hypotheses.org/1907 (in French)
### WHAT IS IT DOING ?
# This R code can analyze a Zotero library of references & can produce graphics and tabular statistics
# Optionally, it can retrieve information from Wikidata to enrich the original information
### WHY ?
# 1. To learn R Studio with a fun, useful and easy practice
# 2. To better understand your library of references, for example :
# What is the distribution of the years of publication ? Did the publications happen long after the journals were created, or are they new academic outlets ?
# What are the main journals of the articles ? It can give an idea where to publish later
# What are the main publishers of the books and book sections ?
# What are the main languages of the references ?
# What are the main authors of the studies ?
# Are authors single authors or multiples authors ?
# What are the main topics of the studies ? (NB: you will need to have indexed your corpus of references with your own thesaurus)
# How are the topics distributed through the years ?
# What does a word cloud of the titles of the studies look like ?
# What is the distribution of Master and PhD thesis (by year, country, number of pages)
### WHAT DO YOU NEED ?
# This file is supposed to contain all the information needed to understand, process, and produce data
# 1. Zotero installed
# With a library of references
# - The more cleaned are the references, the better is the analysis
# - The better indexed are the references, also the better is the analysis
# 2. R and R Studio installed
# With all the packages (that should be retrieved and installed at the first run of this code)
# 3. OpenRefine installed (optional)
# If you want to reconcile your data with open linked data online
# It means your will semi-automatically retrieve more data (if it is indexed in Wikidata) to enrich your own data
# For example, with the name of the journal we will retrieve the date of creation of the journal (Inception) and the country of origin of the journal
# Yes, it's awesome!
# 4. Any PDF viewer, to open some graphics in pdf
# 5. OpenOffice Calc, to open (and maybe edit) the csv files
### CREDITS
# ChatGPT 3.5 by OpenAI for a lot of help with back and forth feedback on my R code
# Caroline Patenaude, Data librarian at Université de Montréal for teaching me R & OpenRefine
# Céline Van den Rul at https://towardsdatascience.com/create-a-word-cloud-with-r-bde3e7422e8a for word clouds
# David Tingle at https://davidtingle.com/misc/bib for ideas of analysis to perform
# Zotero development team
# R and R Studio development team
# OpenRefine development team
# Wikidata development team and community of contributors
#####################
# Let's start.
# Create a new working folder on your computer
# Example : MyZoteroAnalysis
# Go to Zotero > My library > Right click > Export > Format : CSV (Unicode UTF-8)
# Export the file "My library.csv" into your new working folder "MyZoteroAnalysis"
# Copy this "ZoteroRnalysis.R" file into this working folder
# Open the "ZoteroRnalysis.R" file (with R Studio > File > Open File... or double-click on it from the folder)
# Go to this line within R Studio (it should be green, because it's a comment and all comments are ignored by the program, so we can write anything there, especially everything that will make the code more understandable)
# Go to RStudio > Session > Set Working Directory > To source file location
# Go in the Console frame
# Copy the text after ">" in the console (it should start with "setwd...")
# Paste it over the line following this block of comments (the line below is set for the author's own computer)
# Then, put the cursor on that line and click Run on the top-right of this frame
setwd("C:/Users/martinop/OneDrive - Universite de Montreal/perso/en_cours/MyZoteroAnalysis")
# Now let's install some packages
# Put the cursor under this block and click Run on the top-right of this frame
# If a package is not installed yet, it will be installed (some output will appear in the console)
# If a package is already installed, nothing visible happens (just some blue lines in the console)

# Install a package on first use, then attach it to the session.
# requireNamespace() avoids re-installing packages that are already present;
# character.only = TRUE lets library() accept the package name as a string.
ensure_package <- function(pkg) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
  library(pkg, character.only = TRUE)
}

ensure_package("ggplot2") # static plots
ensure_package("plotly")  # interactive treemaps
# Now you will Run all the lines that are not comments
# As you can see, Run moves you past the lines that were executed, so you can just keep clicking Run
# And watch what is going on in the console, and sometimes in the viewer panel
# Create a specific folder named "output" to place all the outputs of the analysis (PDF, csv and everything else)
# Every artifact of the analysis (PDF, CSV, HTML) lands in this directory.
output_folder <- "output"

# Create it on first run; in either case tell the user where files will go.
if (dir.exists(output_folder)) {
  cat("Folder already exists:", output_folder, "\n")
} else {
  dir.create(output_folder)
  cat("Folder created:", output_folder, "\n")
}
# Let's load the Zotero references file into R
# If needed replace "My library.csv" with the actual file name (or URL) of your CSV file
file_name <- "My library.csv"
# Read the CSV export into a data frame.
# Fix: use read.csv() instead of read.table(sep = ","):
#  - read.table keeps comment.char = "#", so a "#" inside a title would
#    silently truncate that row;
#  - read.table's default quote = "\"'" treats apostrophes (common in titles)
#    as quoting characters and breaks field parsing;
#  - read.table's fill = FALSE errors out on rows with unequal field counts.
# fileEncoding = "UTF-8" re-encodes the connection, matching Zotero's
# "CSV (Unicode UTF-8)" export format.
ZOTEROLIB <- read.csv(file_name, header = TRUE, fileEncoding = "UTF-8")
# Print a one-line confirmation giving an object's name and its class,
# e.g. print_variable_info(ZOTEROLIB) right after loading the data.
print_variable_info <- function(x) {
  print(paste(deparse(substitute(x)), " was created. It's a", class(x), "(class)"))
}
# Confirm the import worked: report the name and class of the loaded library.
print_variable_info(ZOTEROLIB)
# All references now live in the big data frame ZOTEROLIB.
# Several analyses below run on subsets of it, selected by item type and tags.
# NB: the author's personal tags all start with "_" to mark them as thesaurus tags.

# Subset 1 : peer-reviewed journal articles about TTRPGs
# (Item.Type is "journalArticle", Manual.Tags contains both
#  "_peer reviewed" and "_TTRPG")
subset1_ZOTEROLIB <- subset(
  ZOTEROLIB,
  Item.Type == "journalArticle" &
    grepl("_peer reviewed", Manual.Tags) &
    grepl("_TTRPG", Manual.Tags)
)

# Subset 2 : journal articles, books, or book sections about TTRPGs
subset2_ZOTEROLIB <- subset(
  ZOTEROLIB,
  Item.Type %in% c("journalArticle", "book", "bookSection") &
    grepl("_TTRPG", Manual.Tags)
)
# Other Item.Type ??
# Here is a short list from https://www.zotero.org/support/kb/item_types_and_fields
# book
# bookSection
# conferencePaper
# journalArticle
# magazineArticle
# newspaperArticle
# thesis
# webpage
# Important : they are case sensitive, they use no space, and their 1st letter is lowercase
# Let's start with some basic analysis
## ANALYSIS : Peer-reviewed articles distributed by year
# Assuming your data frame is named DF with a column Publication.Year
DF <- subset1_ZOTEROLIB
# Assuming Publication.Year is initially stored as character or factor
# Convert it to character if it's factor
if (is.factor(DF$Publication.Year)) {
DF$Publication.Year <- as.character(DF$Publication.Year)
}
# Remove any non-numeric characters and convert to numeric
DF$Publication.Year <- as.numeric(gsub("[^0-9]", "", DF$Publication.Year))
# Count the observations for each date
date_counts <- table(DF$Publication.Year)
# Create a sequence of years from the minimum to the maximum, excluding NA
if(sum(!is.na(DF$Publication.Year)) > 0) { # Check if we have any non-NA values
min_year <- min(DF$Publication.Year, na.rm = TRUE)
max_year <- max(DF$Publication.Year, na.rm = TRUE)
if(is.finite(min_year) && is.finite(max_year)) { # Check if min and max are valid
all_years <- seq(min_year, max_year, by = 1)
print(paste("Sequence created from", min_year, "to", max_year))
} else {
print("Min or max year is not finite")
}
} else {
print("No valid years found in the dataset")
}
# Create a data frame with all years
all_years_df <- data.frame(Year = all_years)
# Merge the existing data with the data frame containing all years
merged_data <- merge(all_years_df, data.frame(Year = names(date_counts), Count = as.numeric(date_counts)), by = "Year", all.x = TRUE)
# Replace NAs with 0 for the count column
merged_data$Count[is.na(merged_data$Count)] <- 0
# Create a bar plot
# Assuming merged_data is your data frame with columns 'Year' and 'Count'
gg_plot <- ggplot(merged_data, aes(x = Year, y = Count)) +
geom_bar(stat = "identity", fill = "skyblue", width = 0.7) +
labs(title = "Peer-reviewed Journal Articles Distributed by Year", x = "Date", y = "Count") +
theme_minimal()
# After runing the line under, a graphic is supposed to be displayed in the frame at the right of this one
print(gg_plot)
# Save the ggplot to a PDF file
file_path <- file.path(output_folder, "pr_journalarticles_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the data as a CSV file with column names
file_path <- file.path(output_folder, "pr_journalarticles_by_year.csv")
write.csv(merged_data, file = file_path, row.names = FALSE)
# ANALYSIS : Journal titles most frequently utilized for peer-reviewed articles
DF <- subset1_ZOTEROLIB

# Tally how often each journal (Publication.Title) occurs, most frequent first
all_titles <- sort(table(DF$Publication.Title), decreasing = TRUE)
all_titles_df <- data.frame(Publication.Title = names(all_titles),
                            Count = as.numeric(all_titles))
# Keep only the 15 most frequent journals for the chart
top_titles_df <- head(all_titles_df, 15)

# Horizontal bar chart, longest bar on top
gg_plot <- ggplot(top_titles_df, aes(x = reorder(Publication.Title, Count), y = Count)) +
  geom_col(fill = "skyblue") +
  labs(title = "Top 15 Academic Journals", x = "Journal Titles", y = "Count") +
  theme_minimal() +
  coord_flip()
print(gg_plot)

# Save the chart as PDF
file_path <- file.path(output_folder, "journal_titles_counts.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the full tally (not just the top 15) as CSV
file_path <- file.path(output_folder, "all_journal_titles_counts.csv")
write.csv(all_titles_df, file = file_path, row.names = FALSE)
# ANALYSIS : Books and book sections distributed by year (publishers come next)
# Create another subset: (book OR book section) AND TTRPG
subset3_ZOTEROLIB <- subset(ZOTEROLIB,
                            (Item.Type == "book" |
                               Item.Type == "bookSection") &
                              grepl("_TTRPG", Manual.Tags))
DF <- subset3_ZOTEROLIB

# Fix: Publication.Year comes straight from the CSV and may be character or
# factor here (unlike subset1, it was never normalized). Without this
# conversion, max() returns a character value, is.finite() is FALSE, and the
# whole analysis below was silently skipped.
if (is.factor(DF$Publication.Year)) {
  DF$Publication.Year <- as.character(DF$Publication.Year)
}
DF$Publication.Year <- as.numeric(gsub("[^0-9]", "", DF$Publication.Year))

# Separate counts for books and book sections
DF$Item.Type <- as.factor(DF$Item.Type)
# Number of items per (year, type); NA years are dropped by aggregate
counts_by_type <- aggregate(rep(1, nrow(DF)),
                            by = list(Year = DF$Publication.Year,
                                      Type = DF$Item.Type),
                            FUN = sum)
names(counts_by_type)[3] <- "Count"

if (sum(!is.na(DF$Publication.Year)) > 0) {
  min_year <- 1974 # manual override: start of the TTRPG corpus
  max_year <- max(DF$Publication.Year, na.rm = TRUE)
  if (is.finite(min_year) && is.finite(max_year)) {
    all_years <- seq(min_year, max_year, by = 1)
    print(paste("Sequence created from", min_year, "to", max_year))
    # Complete (year, type) grid so years with no items still plot as 0
    types <- unique(DF$Item.Type)
    complete_grid <- expand.grid(Year = all_years, Type = types)
    merged_data <- merge(complete_grid, counts_by_type,
                         by = c("Year", "Type"), all.x = TRUE)
    merged_data$Count[is.na(merged_data$Count)] <- 0
    # Stacked bars: books and book sections per year
    gg_plot <- ggplot(merged_data, aes(x = Year, y = Count, fill = Type)) +
      geom_bar(stat = "identity", position = "stack", width = 0.7) +
      scale_fill_manual(values = c("book" = "skyblue", "bookSection" = "darkblue")) +
      labs(title = "Books and Book Sections Distribution by Year",
           x = "Year",
           y = "Count",
           fill = "Type") +
      theme_minimal() +
      theme(legend.position = "bottom")
    print(gg_plot)
    # Save the plot
    file_path <- file.path(output_folder, "books_booksections_by_year_breakdown.pdf")
    ggsave(file_path, plot = gg_plot, width = 10, height = 6)
    # Save the data
    file_path <- file.path(output_folder, "books_booksections_by_year_breakdown.csv")
    write.csv(merged_data, file = file_path, row.names = FALSE)
  } else {
    print("Min or max year is not finite")
  }
} else {
  print("No valid years found in the dataset")
}
# Listing the top publishers of books and book sections
# Load dplyr, installing it first if missing — same guard as used for
# ggplot2 and plotly above (previously a bare library() call that failed
# on machines without dplyr installed).
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
library(dplyr)
# Normalize publisher names: imprints/variants of the same house are merged
# under one label, and "Place to Go" entries are dropped
# (NOTE(review): presumably not a real publisher — confirm against the data).
DF <- DF %>%
  filter(!grepl("Place to Go", Publisher)) %>% # Remove "Place to Go" entries
  mutate(Publisher = case_when(
    grepl("Wiley", Publisher) ~ "Wiley",
    grepl("McFarland", Publisher) ~ "McFarland",
    grepl("MIT Press", Publisher) ~ "MIT Press",
    grepl("ETC Press", Publisher) ~ "ETC Press",
    grepl("Springer", Publisher) ~ "Springer",
    TRUE ~ Publisher
  ))
# Count how many books / book sections each (cleaned) publisher has,
# largest first; blank and missing publishers are excluded.
publisher_tally <- DF %>%
  filter(!is.na(Publisher) & nzchar(Publisher)) %>%
  count(Publisher, Item.Type, name = "Count", sort = TRUE)

# The 15 publishers with the highest combined (book + book section) total
leading_publishers <- publisher_tally %>%
  count(Publisher, wt = Count, name = "Total", sort = TRUE) %>%
  head(15) %>%
  pull(Publisher)

# Per-type counts restricted to those 15 publishers
leading_tally <- publisher_tally %>%
  filter(Publisher %in% leading_publishers)

# Horizontal stacked bars: books vs book sections per publisher
gg_plot <- ggplot(leading_tally,
                  aes(x = reorder(Publisher, Count),
                      y = Count,
                      fill = Item.Type)) +
  geom_bar(stat = "identity", position = "stack") +
  scale_fill_manual(values = c("book" = "skyblue", "bookSection" = "darkblue")) +
  labs(title = "Top 15 Publishers for Books and Book Sections",
       x = "Publisher",
       y = "Count",
       fill = "Publication Type") +
  theme_minimal() +
  coord_flip() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1),
        legend.position = "bottom")
print(gg_plot)

# Save the chart as PDF
file_path <- file.path(output_folder, "top_presses_counts_by_type.pdf")
ggsave(file_path, plot = gg_plot, width = 12, height = 8)

# Detailed per-type counts for every publisher, largest first
file_path <- file.path(output_folder, "all_presses_counts_by_type.csv")
write.csv(arrange(publisher_tally, desc(Count)), file = file_path, row.names = FALSE)

# Wide summary: one row per publisher with book / book-section / total counts
summary_table <- publisher_tally %>%
  group_by(Publisher) %>%
  summarise(
    Books = sum(Count[Item.Type == "book"]),
    Book_Sections = sum(Count[Item.Type == "bookSection"]),
    Total = sum(Count)
  ) %>%
  arrange(desc(Total))

# Save the summary table
file_path <- file.path(output_folder, "publisher_summary_by_type.csv")
write.csv(summary_table, file = file_path, row.names = FALSE)
# ANALYSIS : Language used in the journal articles
DF <- subset1_ZOTEROLIB

# Tally the language codes (e.g. "en", "fr") of the articles
language_counts <- table(DF$Language)
# Same tally as a data frame with readable column names
language_df <- data.frame(language = names(language_counts), count = as.numeric(language_counts))

# Deprecated : pie chart version, kept for reference
# gg_plot <- ggplot(language_df, aes(x = "", y = count, fill = language)) +
#   geom_bar(stat = "identity", width = 1, color = "white") +
#   coord_polar(theta = "y") +
#   theme_void() +
#   scale_fill_brewer(palette = "Set3") +
#   labs(title = "Distribution of Language Codes in Peer-reviewed Journal Articles", fill = "Language")

# Install and load treemapify if not already done
if (!requireNamespace("treemapify", quietly = TRUE)) {
  install.packages("treemapify")
}
library(treemapify)

# Treemap: one box per language, sized by its count
gg_plot <- ggplot(language_df,
                  aes(area = count,
                      fill = language,
                      label = paste(language, "\n", count))) +
  geom_treemap() +
  geom_treemap_text(colour = "black",
                    place = "centre",
                    size = 15) +
  scale_fill_brewer(palette = "Set3") +
  theme(legend.position = "none") + # no legend: values are shown in the boxes
  labs(title = "Distribution of Language Codes in Peer-reviewed Journal Articles")
print(gg_plot)

# Save the treemap as PDF
file_path <- file.path(output_folder, "language_counts.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)

# Export the counts as CSV.
# Fix: export language_df (columns "language"/"count") instead of the raw
# table object, which write.csv dumped with uninformative default column names.
file_path <- file.path(output_folder, "language_counts.csv")
write.csv(language_df, file = file_path, row.names = FALSE)
# ANALYSIS : Most mentioned authors in journal articles
DF <- subset1_ZOTEROLIB

# Authors are stored as "Last, First; Last, First; ..." in a single cell.
# Keep the character conversion on DF itself: the gender analysis below
# reuses this same DF$Author column.
DF$Author <- as.character(DF$Author)
# Split on ";", trim whitespace, and tally every individual author
author_tally <- table(trimws(unlist(strsplit(DF$Author, ";"))))

authors_counts_df <- data.frame(Author = names(author_tally), Count = as.numeric(author_tally))
# Most mentioned authors first
authors_counts_df <- authors_counts_df[order(-authors_counts_df$Count), ]

# Collapse any duplicated author labels into a single row
merged_authors_counts_df <- aggregate(Count ~ Author, data = authors_counts_df, sum)

# Export the full tally as CSV
file_path <- file.path(output_folder, "all_authors_counts.csv")
write.csv(merged_authors_counts_df, file = file_path, row.names = FALSE)

# Keep the 30 most mentioned authors for the chart
top_authors <- head(merged_authors_counts_df[order(-merged_authors_counts_df$Count), ], 30)

gg_plot <- ggplot(top_authors, aes(x = reorder(Author, Count), y = Count)) +
  geom_col(fill = "skyblue") +
  labs(title = "Top 30 Authors with the highest frequency of mentions in articles", x = "Author", y = "Count") +
  theme_minimal() +
  coord_flip()
print(gg_plot)

# Save the chart as PDF
file_path <- file.path(output_folder, "top_authors.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the top-30 tally as CSV
file_path <- file.path(output_folder, "top_authors_counts.csv")
write.csv(top_authors, file = file_path, row.names = FALSE)
# Author names gender
## Export the authors_count_df as csv to reconcile with Wikidata and sort the « assumed » gender of the authors based on their first names
# NOTE(review): this overwrites the all_authors_counts.csv written above with
# the unmerged (pre-aggregate) counts — confirm that is intentional before
# relying on that file.
file_path <- file.path(output_folder, "all_authors_counts.csv")
write.csv(authors_counts_df, file = file_path, row.names = FALSE)
# Open this csv (all_authors_counts.csv) in OpenRefine
# Column Author > Add a column based on this column
# > New column name = firstname
# > replace value by
# value.split(',')[1].trim().replace(/^[A-Za-z]\.? /, '').split(' ')[0]
# Column Author > Add a column based on this column
# > New column name = givenname_reco
# > replace value by
# value.split(',')[1].trim().replace(/^[A-Za-z]\.? /, '').split(' ')[0]
# Column givenname_reco > Reconcile > Start reconciling > Wikidata : Type = female given name + Start reconciling
# Click on the facet at the right : None (become orange)
# Column givenname_reco > Reconcile > Start reconciling > Wikidata : Type = male given name + Start reconciling
# Column givenname_reco > Reconcile > Start reconciling > Wikidata : Type (in the box) = unisex given name + Start reconciling
# Column givenname_reco > Add a column based on reconciled value > P31 (instance of)
# Rename the column "instance of" into "instance.of"
# Export all as comma separated value : givenname_gender_reconciled.csv
# Pass through the csv data in LibreOffice Calc to correct some remaining mistakes
# Build one (Author, Year, Gender) row per author occurrence by joining the
# authors of each reference with the gender table reconciled in OpenRefine
# (the csv produced and hand-corrected in the steps above).
gender_data <- read.csv("givenname_gender_reconciled.csv", stringsAsFactors = FALSE)
# Initialize an empty list to store rows for the new DataFrame
author_year_gender_rows <- list()
# Iterate through each row of the original DataFrame
# NOTE(review): DF here is still subset1_ZOTEROLIB (last set in the authors
# analysis above); 1:nrow(DF) would yield c(1, 0) if DF were empty — confirm
# the subset is non-empty before running this.
for (i in 1:nrow(DF)) {
# Split the authors and corresponding years
authors <- trimws(strsplit(DF$Author[i], ";")[[1]])
year <- DF$Publication.Year[i]
# Create a row for each author with corresponding year and gender
for (author in authors) {
# Find matches in gender_data based on whether the Author column contains the author value
# NOTE(review): grepl does regex substring matching, so "Smith, J" also
# matches "Smith, Jane", and regex metacharacters inside a name would be
# interpreted — presumably acceptable after the manual cleaning step; verify.
matching_row <- gender_data[grepl(author, gender_data$Author, ignore.case = TRUE), ]
if (nrow(matching_row) > 0) {
gender <- matching_row$`instance.of`
# Take the first value if gender has multiple values
gender <- gender[1]
} else {
# If no gender information found for author, set it as "Unknown"
gender <- "Unknown"
}
# Append a named character vector; everything becomes character after rbind
author_year_gender_rows[[length(author_year_gender_rows) + 1]] <- c(Author = author,
Publication.Year = year,
Gender = gender)
}
}
# Combine all rows into a character matrix
DFgender <- do.call(rbind, author_year_gender_rows)
# Convert DFgender to a data frame explicitly
DFgender <- as.data.frame(DFgender)
# Convert "Gender" column to a factor with specified levels; values outside
# the list (e.g. "Unknown") become NA and take the last ("gray") color below
DFgender$Gender <- factor(DFgender$Gender, levels = c("male given name", "female given name", "unisex given name", NA))
# Stacked bars: author occurrences per year, colored by assumed gender
gg_plot <- ggplot(DFgender, aes(x = Publication.Year, fill = Gender)) +
geom_bar(position = "stack") +
labs(title = "Distribution of Authors by Gender over Years",
x = "Year", y = "Count") +
scale_fill_manual(values = c("lightblue", "pink", "green", "gray"),
labels = c("Male", "Female", "Unisex", "NA")) +
theme_minimal()
# NOTE(review): unlike the other analyses, this plot is only saved, never
# print()ed — add print(gg_plot) if you also want it displayed.
# Save the ggplot to a PDF file
file_path <- file.path(output_folder, "gender_distribution_byyear.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the author/year/gender table to a CSV file
file_path <- file.path(output_folder, "gender_distribution_byyear.csv")
write.csv(DFgender, file = file_path, row.names = FALSE)
# Open all_authors_counts.csv in OpenRefine
# Column Author > Add a column based on this column
# > New column name = ORCID_reco
# > replace value by
# value => value.split(',')[1].trim() + ' ' + value.split(',')[0].trim()
# Column ORCID_reco > Reconcile : Add service
# Column ORCID_reco > Reconcile > Start reconciling > Person
# Column ORCID_reco > Reconcile > Action > Match each cell to its best candidate
# Column ORCID_reco > Reconcile > Add entity identifier column
# Name = ORCID_ID
# To verify in LibreOffice Calc, here is the formula to do the same in a new column and compare with ORCID_reco : =REGEX(A2; "(.*), (.*)"; "$2 $1")
# Open all_authors_counts.csv in OpenRefine
# Column Author > Add a column based on this column
# > New column name = Wikidata_reco
# Column Wikidata_reco > Reconcile > Start reconciling > Human (Q5)
# Column Wikidata_reco > Reconcile > Action > Match each cell to its best candidate
# Column Wikidata_reco > Reconcile > Add entity identifier column
# Name = Wikidata_ID
# To verify in LibreOffice Calc, here is the formula to do the same in a new column and compare with Wikidata_reco : =REGEX(A2; "(.*), (.*)"; "$2 $1")
# ANALYSIS : All your tags in journal articles
# Assumes the references were indexed with personal tags starting with "_"
DF <- subset1_ZOTEROLIB

# Manual.Tags holds every tag of a reference in one ";"-separated cell:
# split, trim, and tally them. The character conversion is applied to DF
# itself to guarantee strsplit gets character input.
DF$Manual.Tags <- as.character(DF$Manual.Tags)
tag_tally <- table(trimws(unlist(strsplit(DF$Manual.Tags, ";"))))

tags_counts_df <- data.frame(Tag = names(tag_tally), Count = as.numeric(tag_tally))
# Most frequent tags first
tags_counts_df <- tags_counts_df[order(-tags_counts_df$Count), ]

# Collapse any duplicated tag labels into a single row
merged_tags_counts_df <- aggregate(Count ~ Tag, data = tags_counts_df, sum)

# Export the complete tag tally as CSV
file_path <- file.path(output_folder, "all_tags_counts.csv")
write.csv(merged_tags_counts_df, file = file_path, row.names = FALSE)

# Keep the 150 most frequent tags (top_tags is reused by the analyses below)
top_tags <- head(merged_tags_counts_df[order(-merged_tags_counts_df$Count), ], 150)
# Export the top tags as CSV
file_path <- file.path(output_folder, "top_tags_counts.csv")
write.csv(top_tags, file = file_path, row.names = FALSE)
# ANALYSIS : Treemap of the capitalized tags (1st rank subject headings)
# Assumes a two-level thesaurus of tags:
#   - generic tags, written in CAPITAL letters (1st level)
#   - specific tags, written in lowercase (2nd level)
# How to build such a thesaurus : https://github.com/pmartinolli/TM-MyThesaurus
# Also (in French) the Manuel pratique de recherche documentaire (PLU6058),
# chapter 15.2 : https://bib.umontreal.ca/multidisciplinaire/plu6058
# Example of a two-level thesaurus : https://github.com/pmartinolli/TM-MyThesaurus/blob/master/files/TTRPG-simple-thesaurus.pdf
# TL;DR, the thesaurus is organized like this :
#
# _PSYCHOLOGY
# _therapy
# _mental disorder
# _cognition
# _well-being
#
# Every specific (lowercase) tag should be paired with its generic (CAPITAL) tag.

# Keep only fully capitalized tags, drop the ubiquitous _TTRPG umbrella tag,
# and require at least 5 occurrences.
capital_top_tags <- top_tags[grep("^_[A-Z]+$", top_tags$Tag), ]
capital_top_tags <- capital_top_tags[capital_top_tags$Tag != "_TTRPG", ]
filtered_cap_tags <- capital_top_tags[capital_top_tags$Count >= 5, ]

# Horizontal bar chart of the 1st rank tags
gg_plot <- ggplot(filtered_cap_tags, aes(x = reorder(Tag, Count), y = Count)) +
  geom_col(fill = "skyblue") +
  labs(title = "Top 1st rank tags", x = "Tags", y = "Count") +
  theme_minimal() +
  coord_flip()
print(gg_plot)

# Save the chart as PDF
file_path <- file.path(output_folder, "top_cap_tags.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)

# Interactive treemap (plotly), saved as a standalone HTML page
treemap_plot <- plot_ly(
  data = filtered_cap_tags,
  labels = ~Tag,
  parents = ~"",
  values = ~Count,
  type = "treemap",
  hoverinfo = "label+value+percent root"
) %>%
  layout(title = "Top Tags Treemap (Capitalized = 1st rank in the subject headings thesaurus)")

file_path <- file.path(output_folder, "top_cap_tags_treemap.html")
htmlwidgets::saveWidget(treemap_plot, file = file_path)
# Display the treemap in the viewer
treemap_plot
# ANALYSIS : Treemap of the NON-capitalized tags (2nd rank subject headings)
# Keep tags seen at least 5 times
filtered_tags <- top_tags[top_tags$Count >= 5, ]
# Keep only tags with no capital letter after the leading "_"
filtered_tags <- filtered_tags[grep("^_[^A-Z]+$", filtered_tags$Tag), ]
# Keep only personal-thesaurus tags (those starting with "_")
filtered_tags <- filtered_tags[grepl("^_", filtered_tags$Tag), ]
# Drop _TTRPG (the umbrella tag, present everywhere) and _peer reviewed
filtered_tags <- filtered_tags[filtered_tags$Tag != "_TTRPG", ]
filtered_tags <- filtered_tags[filtered_tags$Tag != "_peer reviewed", ]

# Horizontal bar chart of the 30 most frequent 2nd rank tags
gg_plot <- ggplot(head(filtered_tags[order(-filtered_tags$Count), ], 30), aes(x = reorder(Tag, Count), y = Count)) +
  geom_col(fill = "skyblue") +
  labs(title = "Top 30 tags - 2nd rank in the subject headings", x = "Tags", y = "Count") +
  theme_minimal() +
  coord_flip()
print(gg_plot)

# Save the chart as PDF
file_path <- file.path(output_folder, "top_noncap_tags.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)

# Interactive treemap (plotly), saved as a standalone HTML page
treemap_plot <- plot_ly(
  data = filtered_tags,
  labels = ~Tag,
  parents = ~"",
  values = ~Count,
  type = "treemap",
  hoverinfo = "label+value+percent root"
) %>%
  layout(title = "Top Tags Treemap (Count >= 5) (2nd rank in the subject headings thesaurus)")

file_path <- file.path(output_folder, "noncap_tags_treemap.html")
htmlwidgets::saveWidget(treemap_plot, file = file_path)
# Display the treemap in the viewer
treemap_plot
# ANALYSIS : the distribution of the 1st rank tags per year
# Relies on filtered_cap_tags computed in the previous analysis.
DF <- subset1_ZOTEROLIB

# Long-format result: one row per (year, 1st-rank tag) with its count
DDFF <- data.frame()

for (year in unique(DF$Publication.Year)) {
  # Tag strings of every reference published that year, extracted once
  # (previously re-computed inside the inner loop; a second, space-stripped
  # copy named year_tags was also built but never used — removed).
  year_tag_strings <- DF$Manual.Tags[DF$Publication.Year == year]
  # One row per 1st-rank tag, count initialized to zero
  year_df <- data.frame(
    Publication.Year = rep(year, length(filtered_cap_tags$Tag)),
    Tags = filtered_cap_tags$Tag,
    Count = 0
  )
  # For each tag, count the references of this year that mention it.
  # Word boundaries keep a tag from matching inside a longer tag.
  for (i in seq_along(year_df$Tags)) {
    regex_pattern <- paste0("\\b", year_df$Tags[i], "\\b")
    year_df$Count[i] <- sum(grepl(regex_pattern, year_tag_strings))
  }
  DDFF <- rbind(DDFF, year_df)
}
# Reset row names
rownames(DDFF) <- NULL

# Stacked bars: tag counts per publication year
# (title typo "throught" fixed)
gg_plot <- ggplot(DDFF, aes(x = Publication.Year, y = Count, fill = Tags)) +
  geom_bar(stat = "identity", position = "stack") +
  labs(title = "Distribution of tags through the years",
       x = "Publication Year",
       y = "Count") +
  theme_minimal()
print(gg_plot)

# Save the chart as PDF
file_path <- file.path(output_folder, "tags_distributed_by_year.pdf")
ggsave(file_path, plot = gg_plot, width = 8, height = 6)
# Export the long-format table as CSV
file_path <- file.path(output_folder, "tags_distributed_by_year.csv")
write.csv(DDFF, file = file_path, row.names = FALSE)