-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdataload.py
101 lines (86 loc) · 3.86 KB
/
dataload.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#!/usr/bin/env python
# encoding: utf-8
"""
Functions for loading data from csv files
"""
import os
import numpy as np
import pandas as pd
import pymatgen as mg
def _cell_volume(frame):
    """
    Return the unit-cell volume for every row of ``frame`` as a Series.

    Uses the closed-form expression for a general (triclinic) cell,

        V = a*b*c*sqrt(1 - cos(alpha)^2 - cos(beta)^2 - cos(gamma)^2
                       + 2*cos(alpha)*cos(beta)*cos(gamma)),

    with the angle columns ``alpha``, ``beta`` and ``gamma`` given in degrees
    (they are passed through ``np.radians``). The computation is vectorised,
    so it also works on an empty frame. An angle triple that cannot form a
    cell makes the radicand negative and yields NaN for that row.
    """
    cos_alpha = np.cos(np.radians(frame['alpha']))
    cos_beta = np.cos(np.radians(frame['beta']))
    cos_gamma = np.cos(np.radians(frame['gamma']))
    radicand = (1.0 - cos_alpha ** 2 - cos_beta ** 2 - cos_gamma ** 2
                + 2.0 * cos_alpha * cos_beta * cos_gamma)
    return frame['a'] * frame['b'] * frame['c'] * np.sqrt(radicand)


def _add_derived_features(frame, has_extras):
    """
    Add the derived feature columns to ``frame`` in place and return it.

    ``has_extras`` says whether an extras file was merged in; only then are
    the columns that need ``avg_mass`` and ``o_cnt`` computed.
    """
    # Combined category, e.g. spacegroup 33 with 80 atoms -> '33_80'
    frame['spacegroup_natoms'] = (frame['spacegroup'].astype(str) + '_' +
                                  frame['natoms'].astype(int).astype(str))
    # Cell volume and the densities derived from it
    frame['cellvol'] = _cell_volume(frame)
    frame['atom_density'] = frame['natoms'] / frame['cellvol']
    if has_extras:
        frame['mass_density'] = frame['avg_mass'] / frame['cellvol']
    # Unit cell angles in radians
    for angle in ('alpha', 'beta', 'gamma'):
        frame[angle + '_r'] = np.radians(frame[angle])
    if has_extras:
        frame['o_fraction'] = frame['o_cnt'] / frame['natoms']
    return frame


def load_features(data_dir, with_ext=True, with_geo=True, expectO=0.6):
    """
    Function to load data from data_dir and output train and test dataframes.

    Reads ``train.csv`` and ``test.csv`` from ``data_dir`` and, when
    requested, inner-merges the matching ``*_ext.csv`` / ``*_geo.csv`` files
    on the ``id`` column.

    Also adds features:
    - cell volume (``cellvol``)
    - atomic and mass density (``atom_density``, ``mass_density``)
    - unit cell angles in radians (``alpha_r``, ``beta_r``, ``gamma_r``)
    - a combined spacegroup and natoms category (``spacegroup_natoms``)
    - fraction of O atoms (``o_fraction``)

    ``mass_density`` and ``o_fraction`` are only added when ``with_ext`` or
    ``with_geo`` is true.
    NOTE(review): they need ``avg_mass`` and ``o_cnt`` columns, presumably
    supplied by the ext file - confirm that ``with_geo`` alone provides them.

    Parameters:
    - data_dir: directory containing the csv files
    - with_ext: merge ``train_ext.csv`` / ``test_ext.csv``
    - with_geo: merge ``train_geo.csv`` / ``test_geo.csv``
    - expectO: O fraction every row must have when extras are loaded

    Returns the tuple ``(train, test)`` of dataframes.

    Raises AssertionError if any row's O fraction differs from ``expectO``.
    """
    base_columns = ['id', 'spacegroup', 'natoms', 'al',
                    'ga', 'in', 'a', 'b', 'c',
                    'alpha', 'beta',
                    'gamma']

    # Load basic features; the header row in the files is replaced by `names`
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'),
                        names=base_columns + ['E0', 'bandgap'],
                        header=0,
                        sep=',')
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'),
                       names=base_columns,
                       header=0,
                       sep=',')

    # Optional extras: 'ext' holds features from xyz files and element
    # properties, 'geo' holds geometry features from the crystal graph.
    for enabled, suffix in ((with_ext, 'ext'), (with_geo, 'geo')):
        if not enabled:
            continue
        train_extra = pd.read_csv(
            os.path.join(data_dir, 'train_' + suffix + '.csv'),
            header=0,
            sep=',')
        test_extra = pd.read_csv(
            os.path.join(data_dir, 'test_' + suffix + '.csv'),
            header=0,
            sep=',')
        train = train.merge(train_extra, on='id')
        test = test.merge(test_extra, on='id')

    has_extras = bool(with_ext or with_geo)
    train = _add_derived_features(train, has_extras)
    test = _add_derived_features(test, has_extras)

    if has_extras:
        # Check O fraction is the expected value for all rows. Raised
        # explicitly (not via `assert`) so the check survives `python -O`,
        # and compared with a tight tolerance so a float-rounded `expectO`
        # is not rejected.
        for label, frame in (('train', train), ('test', test)):
            matches = np.isclose(frame['o_fraction'], expectO,
                                 rtol=1e-9, atol=1e-12)
            if not matches.all():
                raise AssertionError(
                    '%s set: %d row(s) have an O fraction different from '
                    'the expected %r'
                    % (label, int((~matches).sum()), expectO))

    return train, test