Skip to content

1.0.5

Latest
Compare
Choose a tag to compare
@msorkhpar msorkhpar released this 12 Jun 02:23
· 17 commits to main since this release

We have generated these datasets using the A Brief History of Human Time project.
These datasets contain different sets of seed nodes, categorized by various human arts and professions.

dataset (variant, size, None/train/val/test) #roots #summaries #nodes #edges #labels roots category distribution Running Time(sec)
WikiLitArt-s
csv, graphml, croissant.json
494 10416 85346 136950 547 actor=150
composer=35
film=41
novelist=24
painter=59
poet=39
screenwriter=17
singer=72
writer=57
91.934
WikiLitArt-s-train
csv, graphml, croissant.json
346 7234 61885 96497 508 actor=105
composer=24
film=29
novelist=17
painter=42
poet=27
screenwriter=12
singer=50
writer=40
66.023
WikiLitArt-s-val
csv, graphml, croissant.json
74 1572 14763 20795 340 actor=23
composer=5
film=6
novelist=4
painter=9
poet=6
screenwriter=2
singer=11
writer=8
14.364
WikiLitArt-s-test
csv, graphml, croissant.json
74 1626 15861 22029 350 actor=22
composer=6
film=6
novelist=3
painter=8
poet=6
screenwriter=3
singer=11
writer=9
14.6
WikiLitArt-m
csv, graphml, croissant.json
494 10416 128061 220263 604 actor=150
composer=35
film=41
novelist=24
painter=59
poet=39
screenwriter=17
singer=72
writer=57
155.368
WikiLitArt-m-train
csv, graphml, croissant.json
346 7234 93251 155667 566 actor=105
composer=24
film=29
novelist=17
painter=42
poet=27
screenwriter=12
singer=50
writer=40
111.636
WikiLitArt-m-val
csv, graphml, croissant.json
74 1572 22214 33547 375 actor=23
composer=5
film=6
novelist=4
painter=9
poet=6
screenwriter=2
singer=11
writer=8
22.957
WikiLitArt-m-test
csv, graphml, croissant.json
74 1626 24130 35980 394 actor=22
composer=6
film=6
novelist=3
painter=8
poet=6
screenwriter=3
singer=11
writer=9
26.187
WikiLitArt-l
csv, graphml, croissant.json
494 10416 239491 466905 703 actor=150
composer=35
film=41
novelist=24
painter=59
poet=39
screenwriter=17
singer=72
writer=57
353.113
WikiLitArt-l-train
csv, graphml, croissant.json
346 7234 176057 332279 661 actor=105
composer=24
film=29
novelist=17
painter=42
poet=27
screenwriter=12
singer=50
writer=40
244.544
WikiLitArt-l-val
csv, graphml, croissant.json
74 1572 42745 71734 446 actor=23
composer=5
film=6
novelist=4
painter=9
poet=6
screenwriter=2
singer=11
writer=8
57.263
WikiLitArt-l-test
csv, graphml, croissant.json
74 1626 46890 77931 493 actor=22
composer=6
film=6
novelist=3
painter=8
poet=6
screenwriter=3
singer=11
writer=9
60.466
WikiCinema-s
csv, graphml, croissant.json
493 11750 70753 126915 469 actor=405
film=88
118.014
WikiCinema-s-train
csv, graphml, croissant.json
345 8374 52712 89306 437 actor=284
film=61
84.364
WikiCinema-s-val
csv, graphml, croissant.json
73 1650 13362 19280 305 actor=59
film=14
18.651
WikiCinema-s-test
csv, graphml, croissant.json
75 1744 14777 21567 313 actor=62
film=13
19.851
WikiCinema-m
csv, graphml, croissant.json
493 11750 101529 196061 541 actor=405
film=88
196.413
WikiCinema-m-train
csv, graphml, croissant.json
345 8374 75900 138897 491 actor=284
film=61
142.091
WikiCinema-m-val
csv, graphml, croissant.json
73 1650 19674 30152 344 actor=59
film=14
31.722
WikiCinema-m-test
csv, graphml, croissant.json
75 1744 22102 34499 342 actor=62
film=13
33.674
WikiCinema-l
csv, graphml, croissant.json
493 11750 185098 397546 614 actor=405
film=88
475.679
WikiCinema-l-train
csv, graphml, croissant.json
345 8374 139598 284417 575 actor=284
film=61
333.148
WikiCinema-l-val
csv, graphml, croissant.json
73 1650 37352 63744 412 actor=59
film=14
68.62
WikiCinema-l-test
csv, graphml, croissant.json
75 1744 43238 74205 426 actor=62
film=13
87.07
WikiPro-s
csv, graphml, croissant.json
493 9853 79825 125912 616 actor=58
football=156
journalist=14
lawyer=16
painter=23
player=25
politician=125
singer=27
sport=21
writer=28
126.119
WikiPro-s-train
csv, graphml, croissant.json
345 6832 57529 87768 575 actor=41
football=109
journalist=10
lawyer=11
painter=16
player=17
politician=87
singer=19
sport=15
writer=20
89.874
WikiPro-s-val
csv, graphml, croissant.json
74 1548 15769 21351 405 actor=9
football=23
journalist=2
lawyer=3
painter=3
player=4
politician=19
singer=4
sport=3
writer=4
21.021
WikiPro-s-test
csv, graphml, croissant.json
74 1484 15657 21145 384 actor=8
football=24
journalist=2
lawyer=2
painter=4
player=4
politician=19
singer=4
sport=3
writer=4
21.743
WikiPro-m
csv, graphml, croissant.json
493 9853 119305 198663 670 actor=58
football=156
journalist=14
lawyer=16
painter=23
player=25
politician=125
singer=27
sport=21
writer=28
208.157
WikiPro-m-train
csv, graphml, croissant.json
345 6832 86434 138676 633 actor=41
football=109
journalist=10
lawyer=11
painter=16
player=17
politician=87
singer=19
sport=15
writer=20
141.563
WikiPro-m-val
csv, graphml, croissant.json
74 1548 24230 34636 463 actor=9
football=23
journalist=2
lawyer=3
painter=3
player=4
politician=19
singer=4
sport=3
writer=4
36.045
WikiPro-m-test
csv, graphml, croissant.json
74 1484 24117 34157 462 actor=8
football=24
journalist=2
lawyer=2
painter=4
player=4
politician=19
singer=4
sport=3
writer=4
36.967
WikiPro-l
csv, graphml, croissant.json
493 9853 230442 412766 769 actor=58
football=156
journalist=14
lawyer=16
painter=23
player=25
politician=125
singer=27
sport=21
writer=28
489.409
WikiPro-l-train
csv, graphml, croissant.json
345 6832 166685 290069 725 actor=41
football=109
journalist=10
lawyer=11
painter=16
player=17
politician=87
singer=19
sport=15
writer=20
334.864
WikiPro-l-val
csv, graphml, croissant.json
74 1548 48205 74387 549 actor=9
football=23
journalist=2
lawyer=3
painter=3
player=4
politician=19
singer=4
sport=3
writer=4
84.089
WikiPro-l-test
csv, graphml, croissant.json
74 1484 47981 72845 546 actor=8
football=24
journalist=2
lawyer=2
painter=4
player=4
politician=19
singer=4
sport=3
writer=4
92.545
WikiProFem-s
csv, graphml, croissant.json
468 8338 79926 123193 571 actor=141
athletic=25
football=24
journalist=16
painter=16
player=32
politician=81
singer=69
sport=18
writer=46
177.63
WikiProFem-s-train
csv, graphml, croissant.json
330 5587 58329 87492 521 actor=98
athletic=18
football=17
journalist=9
painter=13
player=22
politician=57
singer=48
sport=14
writer=34
127.614
WikiProFem-s-val
csv, graphml, croissant.json
68 1367 14148 19360 344 actor=21
athletic=4
football=3
journalist=4
painter=1
player=5
politician=13
singer=11
sport=1
writer=5
29.081
WikiProFem-s-test
csv, graphml, croissant.json
70 1387 13642 18567 360 actor=22
athletic=3
football=4
journalist=3
painter=2
player=5
politician=11
singer=10
sport=3
writer=7
27.466
WikiProFem-m
csv, graphml, croissant.json
468 8338 122728 196838 631 actor=141
athletic=25
football=24
journalist=16
painter=16
player=32
politician=81
singer=69
sport=18
writer=46
301.718
WikiProFem-m-train
csv, graphml, croissant.json
330 5587 89922 140505 600 actor=98
athletic=18
football=17
journalist=9
painter=13
player=22
politician=57
singer=48
sport=14
writer=34
217.699
WikiProFem-m-val
csv, graphml, croissant.json
68 1367 21978 31230 409 actor=21
athletic=4
football=3
journalist=4
painter=1
player=5
politician=13
singer=11
sport=1
writer=5
46.793
WikiProFem-m-test
csv, graphml, croissant.json
70 1387 21305 29919 394 actor=22
athletic=3
football=4
journalist=3
painter=2
player=5
politician=11
singer=10
sport=3
writer=7
46.317
WikiProFem-l
csv, graphml, croissant.json
468 8338 248012 413895 722 actor=141
athletic=25
football=24
journalist=16
painter=16
player=32
politician=81
singer=69
sport=18
writer=46
768.99
WikiProFem-l-train
csv, graphml, croissant.json
330 5587 183710 297686 676 actor=98
athletic=18
football=17
journalist=9
painter=13
player=22
politician=57
singer=48
sport=14
writer=34
544.893
WikiProFem-l-val
csv, graphml, croissant.json
68 1367 46018 67193 492 actor=21
athletic=4
football=3
journalist=4
painter=1
player=5
politician=13
singer=11
sport=1
writer=5
116.758
WikiProFem-l-test
csv, graphml, croissant.json
70 1387 44193 63563 472 actor=22
athletic=3
football=4
journalist=3
painter=2
player=5
politician=11
singer=10
sport=3
writer=7
118.524

Dataset Parameters

Parameter Value
Min valid summary edges 5
Random walk depth length 3
Min random walk number-small 100
Min random walk number-medium 150
Min random walk number-large 300
Max random walk number-small 300
Max random walk number-medium 600
Max random walk number-large 1800
Bridges number 5

Graph Structure

In the following you can see a sample of the graph format (we highly recommend using our toolkit to load the datasets):

CSV Format

After unzipping the {variant}-{size}-{dataset_type}.zip file, you will find the following CSV files:

{variant}-{size}-{dataset_type}-entities.csv contains entities. An entity is a Wikidata item (node) in our
dataset.

Field Description datatype
id incremental integer starting from zero int
entity Wikidata qid, e.g. Q76 string
wikidata_label Wikidata label (nullable) string
wikidata_desc Wikidata description (nullable) string
wikipedia_title Wikipedia title (nullable) string
wikipedia_id Wikipedia page id (nullable) long

{variant}-{size}-{dataset_type}-root-entities.csv contains root entities. A root entity is a seed node
described previously.

Field Description datatype
entity id key in {variant}-{size}-{dataset_type}-entities.csv int
category category string

{variant}-{size}-{dataset_type}-predicates.csv contains predicates. A predicate is a Wikidata property
describing a connection between two entities.

Field Description datatype
id incremental integer starting from zero int
predicate Wikidata Property id, e.g. P121 string
predicate_label Wikidata Property label (nullable) string
predicate_desc Wikidata Property description (nullable) string

{variant}-{size}-{dataset_type}-triples.csv contains triples. A triple is an edge between two entities with a
predicate.

Field Description datatype
subject id key in {variant}-{size}-{dataset_type}-entities.csv int
predicate id key in {variant}-{size}-{dataset_type}-predicates.csv int
object id key in {variant}-{size}-{dataset_type}-entities.csv int

{variant}-{size}-{dataset_type}-ground-truths.csv contains ground truth triples. A ground truth triple is an
edge that is marked as a summary for a root entity.

Field Description datatype
root_entity entity in {variant}-{size}-{dataset_type}-root-entities.csv int
subject id key in {variant}-{size}-{dataset_type}-entities.csv int
predicate id key in {variant}-{size}-{dataset_type}-predicates.csv int
object id key in {variant}-{size}-{dataset_type}-entities.csv int

Note: for this file one of the columns subject or object is equal to the root_entity.

Example of CSV Files

# entities.csv
id,entity,wikidata_label,wikidata_desc,wikipedia_title,wikipedia_id
0,Q43416,Keanu Reeves,Canadian actor (born 1964),Keanu_Reeves,16603
1,Q3820,Beirut,capital and largest city of Lebanon,Beirut,37428
2,Q639669,musician,"person who composes, conducts or performs music",Musician,38284
3,Q219150,Constantine,2005 film directed by Francis Lawrence,Constantine_(film),1210303
# root-entities.csv
entity,category
0,actor
# predicates.csv
id,predicate,predicate_label,predicate_desc
0,P19,place of birth,location where the subject was born
1,P106,occupation,"occupation of a person; see also ""field of work"" (Property:P101), ""position held"" (Property:P39)"
2,P161,cast member,"actor in the subject production [use ""character role"" (P453) and/or ""name of the character role"" (P4633) as qualifiers] [use ""voice actor"" (P725) for voice-only role]"
# triples.csv
subject,predicate,object
0,0,1
0,1,2
3,2,0
# ground-truths.csv
root_entity,subject,predicate,object
0,0,0,1
0,3,2,0

GraphML Example

The same graph can be represented in GraphML format.

<?xml version="1.0" encoding="UTF-8"?>
<graphml xmlns="http://graphml.graphdrawing.org/xmlns" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://graphml.graphdrawing.org/xmlns http://graphml.graphdrawing.org/xmlns/1.0/graphml.xsd">
    <key id="d9" for="edge" attr.name="summary_for" attr.type="string"/>
    <key id="d8" for="edge" attr.name="predicate_desc" attr.type="string"/>
    <key id="d7" for="edge" attr.name="predicate_label" attr.type="string"/>
    <key id="d6" for="edge" attr.name="predicate" attr.type="string"/>
    <key id="d5" for="node" attr.name="category" attr.type="string"/>
    <key id="d4" for="node" attr.name="is_root" attr.type="boolean"/>
    <key id="d3" for="node" attr.name="wikidata_desc" attr.type="string"/>
    <key id="d2" for="node" attr.name="wikipedia_title" attr.type="string"/>
    <key id="d1" for="node" attr.name="wikipedia_id" attr.type="long"/>
    <key id="d0" for="node" attr.name="wikidata_label" attr.type="string"/>
    <graph edgedefault="directed">
        <node id="Q43416">
            <data key="d0">Keanu Reeves</data>
            <data key="d1">16603</data>
            <data key="d2">Keanu_Reeves</data>
            <data key="d3">Canadian actor (born 1964)</data>
            <data key="d4">True</data>
            <data key="d5">actor</data>
        </node>
        <node id="Q3820">
            <data key="d0">Beirut</data>
            <data key="d1">37428</data>
            <data key="d2">Beirut</data>
            <data key="d3">capital and largest city of Lebanon</data>
        </node>
        <node id="Q639669">
            <data key="d0">musician</data>
            <data key="d1">38284</data>
            <data key="d2">Musician</data>
            <data key="d3">person who composes, conducts or performs music</data>
        </node>
        <node id="Q219150">
            <data key="d0">Constantine</data>
            <data key="d1">1210303</data>
            <data key="d2">Constantine_(film)</data>
            <data key="d3">2005 film directed by Francis Lawrence</data>
        </node>
        <edge source="Q43416" target="Q3820" id="P19">
            <data key="d6">P19</data>
            <data key="d7">place of birth</data>
            <data key="d8">location where the subject was born</data>
            <data key="d9">Q43416</data>
        </edge>
        <edge source="Q43416" target="Q639669" id="P106">
            <data key="d6">P106</data>
            <data key="d7">occupation</data>
            <data key="d8">occupation of a person; see also "field of work" (Property:P101), "position held"
                (Property:P39)
            </data>
        </edge>
        <edge source="Q219150" target="Q43416" id="P161">
            <data key="d6">P161</data>
            <data key="d7">cast member</data>
            <data key="d8">actor in the subject production [use "character role" (P453) and/or "name of the character
                role" (P4633) as qualifiers] [use "voice actor" (P725) for voice-only role]
            </data>
            <data key="d9">Q43416</data>
        </edge>
    </graph>
</graphml>