-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathgraph_data_03.py
51 lines (35 loc) · 1.16 KB
/
graph_data_03.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import os
import csv
import pandas as pd
import numpy as np
from collections import defaultdict
from tqdm import tqdm
from multiprocessing import Pool
def not_npc(x: str, table: pd.DataFrame, influencers: set) -> tuple:
    """Decide whether node ``x`` should be kept in the graph.

    A node is dropped (it is an "NPC") only when BOTH of these hold:

    * every ``Target`` on the rows where ``Source == x`` is a member of
      ``influencers`` (vacuously true when ``x`` has no such rows), and
    * no row of ``table`` has ``x`` as its ``Target``.

    Args:
        x: Node identifier, compared against the ``Source`` and ``Target``
            columns of ``table``.
        table: DataFrame with ``Source`` and ``Target`` columns.
        influencers: Set of identifiers used for the subset test.

    Returns:
        A pair ``(x, keep)``. ``keep`` is ``False`` when ``x`` is an NPC and
        ``True`` otherwise. ``x`` is echoed back so that callers receiving
        results out of order can tell which node a verdict belongs to.
    """
    targets_of_x = set(table.loc[table.Source == x, "Target"])

    # Guard clause: a single non-influencer target is enough to keep the
    # node, and lets us skip the second scan over the table.
    if not targets_of_x <= influencers:
        return x, True

    # Only influencer targets: keep the node only if someone points at it.
    return x, not table.loc[table.Target == x].empty
# Input tables are loaded at import time and kept as module globals: `f`
# reads them by name, and `f` is what the worker pool in `main` executes.
_edges_path = os.path.join("polished_data", "comment_graph.csv")
_influencers_path = os.path.join("polished_data", "influencers.csv")

edges = pd.read_csv(_edges_path)
influencers = set(pd.read_csv(_influencers_path).Name)
def f(x):
    """Classify node ``x`` against the module-level ``edges`` table.

    Single-argument wrapper around ``not_npc`` that supplies the module
    globals ``edges`` and ``influencers``; the result of ``not_npc`` is
    returned unchanged.
    """
    return not_npc(x, table=edges, influencers=influencers)
def main():
    """Filter NPC sources out of the comment graph and write the result.

    Steps:
      1. Collect the distinct values of ``edges.Source``.
      2. Classify each of them with ``f`` across a ``multiprocessing.Pool``.
      3. Write the rows of ``edges`` whose ``Source`` was kept to
         ``clean_comment_graph.csv`` inside ``polished_data`` (without the
         index column).
    """
    print("Indexing source nodes.")
    # set() consumes the tqdm-wrapped column directly, so the progress bar
    # is kept without a manual add() loop.
    source_nodes = set(tqdm(edges.Source))

    print("Removing NPCs.")
    with Pool() as pool:
        # Results arrive in arbitrary order; each one carries its node id.
        verdicts = pool.imap_unordered(f, source_nodes)
        # The iterator must be drained inside the ``with`` block: leaving
        # the block terminates the pool.
        good_source_nodes = {
            node
            for node, keep in tqdm(verdicts, total=len(source_nodes))
            if keep
        }

    # Vectorised membership test instead of a per-row Python lambda.
    edges.loc[edges.Source.isin(good_source_nodes)].to_csv(
        os.path.join("polished_data", "clean_comment_graph.csv"), index=False
    )
# Run the pipeline only when this file is executed as a script; merely
# importing the module (as multiprocessing workers may do) must not
# re-enter main().
if __name__ == "__main__":
    main()