# corpus-list.yaml
---
# Shakespeare His Contemporaries: TEI-Simple texts, source given as a git URL.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: shc
  title: Shakespeare His Contemporaries
  categories: literature
  authors: multiple
  centuries: 16th, 17th
  languages: eng
  text:
    markup: TEI-Simple
    url: 'https://github.com/JonathanReeve/corpus-SHC.git'
    file-format: git
# Folger Shakespeare Library Digital Texts: the same collection offered as
# three zip downloads, one per markup (TEI, HTML, TXT).
# NOTE(review): `text` reconstructed as a sequence of download mappings from
# a flattened source - confirm against the loader's schema.
- shortname: folger-shakespeare
  title: Folger Shakespeare Library Digital Texts
  categories: literature
  centuries: 16th, 17th
  authors: single
  languages: eng
  homepage: http://www.folgerdigitaltexts.org/
  url-source: http://www.folgerdigitaltexts.org/download/
  text:
    - markup: TEI
      url: http://www.folgerdigitaltexts.org/download/xml/FolgerDigitalTexts_XML_Complete.zip
      file-format: zip
    - markup: HTML
      url: http://www.folgerdigitaltexts.org/download/html/FolgerDigitalTexts_HTML_Complete.zip
      file-format: zip
    - markup: TXT
      url: http://www.folgerdigitaltexts.org/download/txt/FolgerDigitalTexts_TXT_Complete.zip
      file-format: zip
# Perseus Canonical Greek: TEI texts, source given as a git URL.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: perseus-c-greek
  title: Perseus Canonical Greek
  categories: classics
  authors: multiple
  languages: grc
  text:
    markup: TEI
    url: https://github.com/PerseusDL/canonical-greekLit.git
    file-format: git
# Perseus Canonical Latin: TEI texts, source given as a git URL.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: perseus-c-latin
  title: Perseus Canonical Latin
  categories: classics
  authors: multiple
  languages: lat
  text:
    markup: TEI
    url: https://github.com/PerseusDL/canonical-latinLit.git
    file-format: git
# Stanford Literary Lab 1880s fiction: the full corpus plus two subcorpora
# (male and female authors), each with its own git URL.
# NOTE(review): the flattened source does not show whether `subcorpora`
# belongs to the entry or to `text`; it is placed at entry level here -
# confirm against the loader's schema.
- shortname: stanford-1880s
  title: 'Adult British Fiction of the 1880s, Assembled by the Stanford Literary Lab'
  categories: literature
  centuries: 19th
  languages: eng
  text:
    markup: TXT
    url: https://github.com/JonathanReeve/corpus-1880s-all.git
    file-format: git
  subcorpora:
    - shortname: stanford-1880s-male
      title: 'Adult British fiction of the 1880s, male authors. Assembled by the Stanford Literary Lab'
      text:
        markup: txt
        url: https://github.com/JonathanReeve/corpus-1880s-male.git
        file-format: git
    - shortname: stanford-1880s-female
      title: 'Adult British fiction of the 1880s, female authors. Assembled by the Stanford Literary Lab'
      text:
        markup: txt
        url: https://github.com/JonathanReeve/corpus-1880s-female.git
        file-format: git
# Reuters-21578: plain-text collection offered as a single tar.gz download.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: reuters-21578
  title: Reuters-21578
  homepage: http://www.daviddlewis.com/resources/testcollections/reuters21578/
  categories: history
  languages: eng
  text:
    markup: txt
    url: http://www.daviddlewis.com/resources/testcollections/reuters21578/reuters21578.tar.gz
    file-format: tar.gz
# ECCO-TCP: XML texts split across eleven zip archives, so `url` is a
# sequence rather than a single string.
# NOTE(review): nesting under `text` and the `url` list were reconstructed
# from a flattened source - confirm against the loader's schema.
- shortname: ecco-tcp
  title: Eighteenth Century Collections Online / Text Creation Partnership ECCO-TCP
  homepage: http://www.textcreationpartnership.org/tcp-ecco/
  categories: literature
  centuries: 18th
  languages: eng
  text:
    markup: xml
    file-format: zip
    url:
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200510.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200601.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200604.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200609.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200702.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200802.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200809.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200902.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-200909.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-201004.ecco.zip
      - http://www.lib.umich.edu/tcp/docs/texts/ecco/xml-201106.ecco.zip
# Deutsches Textarchiv: TEI texts offered as a single zip download.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: dta
  title: Deutsches Textarchiv (German Text Archive)
  homepage: http://www.deutschestextarchiv.de/
  categories: literature, science, history
  centuries: 16th, 17th, 18th, 19th
  languages: deu
  text:
    markup: TEI
    file-format: zip
    url: http://media.dwds.de/dta/download/dta_komplett_2016-02-11.zip
# Oxford Text Archive: plain-text collection, source given as a git URL.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: ota
  title: Oxford Text Archive
  homepage: https://ota.ox.ac.uk/
  categories: literature, history
  centuries: 16th, 17th, 18th, 19th, 20th
  languages: eng, enm, fra, deu, lat, grc
  text:
    markup: TXT
    file-format: git
    url: https://github.com/mimno/ota.git
# txtLAB450: multilingual novels as plain text, offered as a zip download.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: txtLAB450
  title: txtLAB450, a Multilingual Data Set of Novels
  categories: literature
  authors: multiple
  centuries: 17th, 18th, 19th
  languages: eng, fra, deu
  text:
    markup: TXT
    url: 'https://ndownloader.figshare.com/files/3686778'
    file-format: zip
# CENLab: plain-text collection, source given as a git URL.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: cenlab
  title: CENLab
  homepage: https://github.com/JonathanReeve/cenlab
  categories: literature
  centuries: 18th, 19th, 20th
  languages: eng
  text:
    markup: TXT
    file-format: git
    url: https://github.com/JonathanReeve/cenlab.git
# Brown Corpus: plain text, offered as a zip download from the NLTK data
# repository.
# NOTE(review): nesting under `text` was reconstructed from a flattened
# source - confirm against the loader's schema.
- shortname: brown
  title: Brown Corpus
  homepage: http://www.essex.ac.uk/linguistics/external/clmt/w3c/corpus_ling/content/corpora/list/private/brown/brown.html
  categories: linguistics
  languages: eng
  centuries: 20th
  text:
    markup: TXT
    file-format: zip
    url: https://github.com/nltk/nltk_data/raw/gh-pages/packages/corpora/brown.zip