-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathdata_utils.py
158 lines (123 loc) · 5.76 KB
/
data_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import pandas as pd
import csv
def remove_columns_from_csv(csv_file, columns_to_remove, output_csv_file=None):
    """
    Removes specified columns from a CSV file and saves the result.

    Every cell is read as plain text (no dtype inference, no NA detection), so
    the columns that are kept are written back with the same values they had
    in the input: leading zeros, literal values such as "NA" or "null", and the
    original number formatting are preserved instead of being re-interpreted.

    Parameters:
    - csv_file: Path to the input CSV file.
    - columns_to_remove: List of column names to be removed. Names that are not
      present in the file are ignored.
    - output_csv_file: Path to the output CSV file. If None, overwrites the input file.

    Returns:
    None
    """
    # dtype=str keeps "007" from becoming 7; keep_default_na=False keeps
    # strings like "NA"/"null" (and blanks) from being turned into NaN.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    # errors='ignore' makes removal of an already-absent column a no-op.
    df = df.drop(columns=columns_to_remove, errors='ignore')

    # Default to rewriting the input file in place.
    if output_csv_file is None:
        output_csv_file = csv_file

    df.to_csv(output_csv_file, index=False)
    print(f"File saved successfully to {output_csv_file} with specified columns removed.")
def remove_hourly_data_from_column(csv_file, column_name, output_csv_file=None):
    """
    Removes hourly data (everything after "T") from a specified column in a CSV file and saves the result.

    Cells are read as plain text, so blank cells in the target column are left
    blank instead of raising, and the other columns are written back without
    being re-interpreted (no dtype inference, no NA detection).

    Parameters:
    - csv_file: Path to the input CSV file.
    - column_name: Name of the column to modify.
    - output_csv_file: Path to the output CSV file. If None, overwrites the input file.

    Returns:
    None

    Raises:
    KeyError if column_name is not a column of the CSV file.
    """
    # Read everything as text: timestamps stay strings and blanks stay ''.
    df = pd.read_csv(csv_file, dtype=str, keep_default_na=False)

    # Keep only the part before the first 'T'. Anything that is not a string
    # (e.g. a missing value) is passed through untouched rather than crashing.
    df[column_name] = df[column_name].map(
        lambda value: value.split('T', 1)[0] if isinstance(value, str) else value
    )

    # Default to rewriting the input file in place.
    if output_csv_file is None:
        output_csv_file = csv_file

    df.to_csv(output_csv_file, index=False)
    print(f"File saved successfully to {output_csv_file} with hourly data removed from {column_name}.")
def convert_amount_to_int(csv_file, column_name='Amount', output_csv_file=None):
    """
    Converts the values in the "Amount" column from float to int in a CSV file and saves the result.

    Fractions are truncated toward zero (1.9 -> 1, -2.5 -> -2). Missing values
    in the column are kept as empty cells instead of aborting the conversion.

    Parameters:
    - csv_file: Path to the input CSV file.
    - column_name: Name of the column to modify, default is 'Amount'.
    - output_csv_file: Path to the output CSV file. If None, overwrites the input file.

    Returns:
    None

    Raises:
    KeyError if column_name is not a column of the CSV file.
    """
    df = pd.read_csv(csv_file)

    amounts = df[column_name]
    if amounts.isna().any():
        # A plain astype(int) raises on NaN. Truncate only the present values,
        # then store them in the nullable Int64 dtype so the missing entries
        # survive and are written out as empty cells.
        df[column_name] = amounts.map(int, na_action='ignore').astype('Int64')
    else:
        df[column_name] = amounts.astype(int)

    # Default to rewriting the input file in place.
    if output_csv_file is None:
        output_csv_file = csv_file

    df.to_csv(output_csv_file, index=False)
    print(f"File saved successfully to {output_csv_file} with {column_name} converted to int.")
# Example usage (uncomment the following lines to test)
# csv_file = 'merged_SRC20_data.csv'
# columns_to_remove = ['id', 'tick_hash', 'creator', 'deci', 'lim', 'max', 'locked_amt', 'locked_block', 'creator_bal', 'creator_name', 'destination_name']
# remove_columns_from_csv(csv_file, columns_to_remove)
def _is_positive_count(value):
    """Return True when a CSV cell holds a number greater than zero."""
    # DictReader yields None for cells missing from a short row; treat those
    # and blank cells as "not in this collection".
    if value is None or not value.strip():
        return False
    # float() accepts both "3" and "3.0"; non-numeric text still raises ValueError.
    return float(value) > 0


def add_count_unique_column(input_file_path, output_file_path):
    """
    Adds a 'count_unique' column to a CSV file that counts the unique collections per address.

    For every row, 'count_unique' is the number of columns whose name starts
    with 'count_' (other than 'count__total' and 'count_unique' itself) that
    hold a value greater than zero. Blank or missing cells count as zero. If
    the input already has a 'count_unique' column it is recomputed in place
    rather than duplicated, so the function can be re-run on its own output.

    Parameters:
    - input_file_path: Path to the existing CSV file.
    - output_file_path: Path to the new (or the same) CSV file with the added 'count_unique' column.

    Raises:
    ValueError if a non-blank count cell is not numeric.
    """
    # Read everything up front so the output path may equal the input path.
    with open(input_file_path, mode='r', newline='') as file:
        reader = csv.DictReader(file)
        data = list(reader)
        # fieldnames is None for a completely empty file.
        fieldnames = list(reader.fieldnames or [])

    if 'count_unique' not in fieldnames:
        fieldnames.append('count_unique')

    # Per-collection counters: every 'count_*' column except the two aggregates.
    collection_columns = [
        name for name in fieldnames
        if name.startswith('count_') and name not in ('count__total', 'count_unique')
    ]

    for row in data:
        row['count_unique'] = sum(
            _is_positive_count(row.get(name)) for name in collection_columns
        )

    with open(output_file_path, mode='w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(data)
#input_file_path = './data/collections/col-all_nft_holders.collection_count.csv'
#output_file_path = './data/collections/col-all_nft_holders.collection_count.final.csv'
#add_count_unique_column(input_file_path, output_file_path)
def count_unique_addresses(csv_file_path, column_name='destination'):
    """
    Counts the number of unique addresses in a specified column of a CSV file.

    The column is validated against the CSV header, so a missing column is
    reported even when the file contains no data rows.

    Parameters:
    - csv_file_path: Path to the CSV file.
    - column_name: Name of the column containing addresses. Defaults to 'destination'.

    Returns:
    The number of unique addresses, or None if the file does not exist or the
    column is not present in the header.
    """
    try:
        # newline='' lets the csv module handle line endings itself, which is
        # required for quoted fields that contain embedded newlines.
        with open(csv_file_path, mode='r', encoding='utf-8', newline='') as csv_file:
            csv_reader = csv.DictReader(csv_file)
            # fieldnames is None for a completely empty file.
            if column_name not in (csv_reader.fieldnames or ()):
                print(f"Column '{column_name}' does not exist in the CSV file.")
                return None
            unique_addresses = {row[column_name] for row in csv_reader}
    except FileNotFoundError:
        print(f"File not found: {csv_file_path}")
        return None

    print(f"Total unique addresses in column '{column_name}': {len(unique_addresses)}")
    return len(unique_addresses)
# Script section: print the number of unique values found in `column_name`
# of the CSV at `csv_file_path`.
# NOTE: these statements are at module level, so they also run on import.
column_name = 'destination'  # Adjust the column name as needed
csv_file_path = './data/merged_SRC20_data_prepped.csv'
count_unique_addresses(csv_file_path, column_name)
#csv_file = 'final_balances_snapshot_src20.csv'
# column_name = 'Block Time'
# remove_hourly_data_from_column(csv_file, column_name)
#convert_amount_to_int(csv_file)