From c1c1a502c7a8b9e89581aace9bd2264591c74a29 Mon Sep 17 00:00:00 2001
From: Max Ghenis <mghenis@gmail.com>
Date: Thu, 11 Jul 2019 11:10:53 -0700
Subject: [PATCH] Refactor solve_lp_for_year.py

*Not ready to merge*

In filing #323 I looked through this file, and took a stab at streamlining the code. Changes include:
1. Using a `wage_bin` function to create wage bins.
2. Using a loop to assign `lhs_vars` dictionary elements.*
3. Using a loop to assign factor variables.
4. Using an `add_target` function to assign `rhs_vars` dictionary elements.*
5. More explicitly adjust `model_vars` for year 2014.
6. Rename `LP` to `lp` since it's not a constant.
7. Other minor changes like comments.

\* These pieces won't currently work since they attempt to access `globals()` from within a function (`locals()` also won't work). Per https://stackoverflow.com/q/56983782/1840471, there's no good way to access these, and this is really a sign that something could be done better. It seems to me that sticking with the dictionary elements rather than creating new variables would be a clean alternative.

I haven't tested this code yet. Would the right way be to run `make cps-files`?
---
 cps_stage2/solve_lp_for_year.py | 163 ++++++++++++--------------------
 1 file changed, 62 insertions(+), 101 deletions(-)

diff --git a/cps_stage2/solve_lp_for_year.py b/cps_stage2/solve_lp_for_year.py
index 0cb48fc7..a2642aec 100644
--- a/cps_stage2/solve_lp_for_year.py
+++ b/cps_stage2/solve_lp_for_year.py
@@ -48,59 +48,39 @@ def target(target_val, pop, factor, value):
     ucomp = data.UCOMP * s006
 
     # wage distribution
-    wage1 = np.where((data.e00100 <= 10000), data.WAS, 0) * s006
-    wage2 = np.where((data.e00100 > 10000) & (data.e00100 <= 20000),
-                     data.WAS, 0) * s006
-    wage3 = np.where((data.e00100 > 20000) & (data.e00100 <= 30000),
-                     data.WAS, 0) * s006
-    wage4 = np.where((data.e00100 > 30000) & (data.e00100 <= 40000),
-                     data.WAS, 0) * s006
-    wage5 = np.where((data.e00100 > 40000) & (data.e00100 <= 50000),
-                     data.WAS, 0) * s006
-    wage6 = np.where((data.e00100 > 50000) & (data.e00100 <= 75000),
-                     data.WAS, 0) * s006
-    wage7 = np.where((data.e00100 > 75000) & (data.e00100 <= 100000),
-                     data.WAS, 0) * s006
-    wage8 = np.where((data.e00100 > 100000), data.WAS, 0) * s006
-
-    lhs_vars['single_returns'] = single_returns
-    lhs_vars['joint_returns'] = joint_returns
-    lhs_vars['hh_returns'] = hh_returns
-    lhs_vars['returns_w_ss'] = returns_w_ss
-    lhs_vars['dep_exemptions'] = dep_exemptions
-    lhs_vars['interest'] = interest
-    lhs_vars['dividend'] = dividend
-    lhs_vars['biz_income'] = biz_income
-    lhs_vars['biz_loss'] = biz_loss
-    lhs_vars['cap_gain'] = cap_gain
-    lhs_vars['pension'] = pension
-    lhs_vars['sch_e_income'] = sch_e_income
-    lhs_vars['sch_e_loss'] = sch_e_loss
-    lhs_vars['ss_income'] = ss_income
-    lhs_vars['ucomp'] = ucomp
-    lhs_vars['wage1'] = wage1
-    lhs_vars['wage2'] = wage2
-    lhs_vars['wage3'] = wage3
-    lhs_vars['wage4'] = wage4
-    lhs_vars['wage5'] = wage5
-    lhs_vars['wage6'] = wage6
-    lhs_vars['wage7'] = wage7
-    lhs_vars['wage8'] = wage8
-
-    print('Preparing Targets for {}'.format(year))
-    apopn = factors['APOPN'][year]
-    aints = factors['AINTS'][year]
-    adivs = factors['ADIVS'][year]
-    aschci = factors['ASCHCI'][year]
-    aschcl = factors['ASCHCL'][year]
-    acgns = factors['ACGNS'][year]
-    atxpy = factors['ATXPY'][year]
-    aschei = factors['ASCHEI'][year]
-    aschel = factors['ASCHEL'][year]
-    asocsec = factors['ASOCSEC'][year]
-    apopsnr = factors['APOPSNR'][year]
-    aucomp = factors['AUCOMP'][year]
-    awage = factors['AWAGE'][year]
+    def wage_bin(left, right):
+        # Don't use pandas.Series.between since that's only inclusive on
+        # both sides or neither side.
+        return np.where((data.e00100 > left) & (data.e00100 <= right),
+                        data.WAS, 0) * s006
+    
+    wage1 = wage_bin(-np.inf, 10000)
+    wage2 = wage_bin(10000, 20000)
+    wage3 = wage_bin(20000, 30000)
+    wage4 = wage_bin(30000, 40000)
+    wage5 = wage_bin(40000, 50000)
+    wage6 = wage_bin(50000, 75000)
+    wage7 = wage_bin(75000, 100000)
+    wage8 = wage_bin(100000, np.inf)
+
+    LHS_VAR_NAMES = [
+        'single_returns', 'joint_returns', 'hh_returns', 'returns_w_ss',
+        'dep_exemptions', 'interest', 'dividend', 'biz_income', 'biz_loss',
+        'cap_gain', 'pension', 'sch_e_income', 'sch_e_loss', 'ss_income',
+        'ucomp', 'wage1', 'wage2', 'wage3', 'wage4', 'wage5', 'wage6', 'wage7',
+        'wage8']
+    
+    for i in LHS_VAR_NAMES:
+        lhs_vars[i] = globals()[i]
+
+    print('Preparing Targets for {}...'.format(year))
+    FACTOR_VAR_NAMES = [
+        'apopn', 'aints', 'adivs', 'aschci', 'aschcl', 'acgns', 'atxpy',
+        'aschei', 'aschel', 'asocsec', 'apopsnr', 'aucomp', 'awage'
+    ]
+    
+    for i in FACTOR_VAR_NAMES:
+        exec(i + " = factors['" + i.upper() + "'][year]")
 
     year = str(year)
     rhs_vars = {}
@@ -113,52 +93,33 @@ def target(target_val, pop, factor, value):
     target_name = 'Dep_return'
     rhs_vars['dep_exemptions'] = (targets[year][target_name] -
                                   dep_exemptions.sum())
-    rhs_vars['interest'] = target(targets[year]['INTS'], apopn, aints,
-                                  interest.sum())
-    rhs_vars['dividend'] = target(targets[year]['DIVS'], apopn, adivs,
-                                  dividend.sum())
-    rhs_vars['biz_income'] = target(targets[year]['SCHCI'], apopn, aschci,
-                                    biz_income.sum())
-    rhs_vars['biz_loss'] = target(targets[year]['SCHCL'], apopn, aschcl,
-                                  biz_loss.sum())
-    rhs_vars['cap_gain'] = target(targets[year]['CGNS'], apopn, acgns,
-                                  cap_gain.sum())
-    rhs_vars['pension'] = target(targets[year]['Pension'], apopn, atxpy,
-                                 pension.sum())
-    rhs_vars['sch_e_income'] = target(targets[year]['SCHEI'], apopn, aschei,
-                                      sch_e_income.sum())
-    rhs_vars['sch_e_loss'] = target(targets[year]['SCHEL'], apopn, aschel,
-                                    sch_e_loss.sum())
-    rhs_vars['ss_income'] = target(targets[year]['SS'], apopsnr, asocsec,
-                                   ss_income.sum())
-    rhs_vars['ucomp'] = target(targets[year]['UCOMP'], apopn, aucomp,
-                               ucomp.sum())
-    rhs_vars['wage1'] = target(targets[year]['wage1'], apopn, awage,
-                               wage1.sum())
-    rhs_vars['wage2'] = target(targets[year]['wage2'], apopn, awage,
-                               wage2.sum())
-    rhs_vars['wage3'] = target(targets[year]['wage3'], apopn, awage,
-                               wage3.sum())
-    rhs_vars['wage4'] = target(targets[year]['wage4'], apopn, awage,
-                               wage4.sum())
-    rhs_vars['wage5'] = target(targets[year]['wage5'], apopn, awage,
-                               wage5.sum())
-    rhs_vars['wage6'] = target(targets[year]['wage6'], apopn, awage,
-                               wage6.sum())
-    rhs_vars['wage7'] = target(targets[year]['wage7'], apopn, awage,
-                               wage7.sum())
-    rhs_vars['wage8'] = target(targets[year]['wage8'], apopn, awage,
-                               wage8.sum())
+    
+    def add_target(val, col, factor, pop=apopn):
+        rhs_vars[val] = target(targets[year][col], apopn, factor, 
+                               globals()[val].sum())
+    
+    add_target('interest', 'INTS', aints)
+    add_target('dividend', 'DIVS', adivs)
+    add_target('biz_income', 'SCHCI', aschci)
+    add_target('biz_loss', 'SCHCL', aschcl)
+    add_target('cap_gain', 'CGNS', acgns)
+    add_target('pension', 'Pension', atxpy)
+    add_target('sch_e_income', 'SCHEI', aschei)
+    add_target('sch_e_loss', 'SCHEL', aschel)
+    # Social Security income uses a different population variable.
+    add_target('ss_income', 'SS', asocsec, apopsnr)
+    add_target('ucomp', 'UCOMP', aucomp)
+    for i in [1, 2, 3, 4, 5, 6, 7, 8]:
+        add_target('wage' + str(i), 'wage' + str(i), awage)
 
     model_vars = ['single_returns', 'joint_returns', 'returns_w_ss',
                   'dep_exemptions', 'interest', 'dividend', 'biz_income',
                   'pension', 'ss_income', 'ucomp', 'wage1', 'wage2', 'wage3',
                   'wage4', 'wage5', 'wage6', 'wage7', 'wage8']
+    # 2014 data lacks dividend and ucomp.
     if year == '2014':
-        model_vars = ['single_returns', 'joint_returns', 'returns_w_ss',
-                      'dep_exemptions', 'interest', 'biz_income',
-                      'pension', 'ss_income', 'wage1', 'wage2', 'wage3',
-                      'wage4', 'wage5', 'wage6', 'wage7', 'wage8']
+        model_vars.remove('dividend')
+        model_vars.remove('ucomp')
 
     vstack_vars = []
     b = []  # list to hold the targets
@@ -177,22 +138,22 @@ def target(target_val, pop, factor, value):
     a2 = np.array(-one_half_lhs)
 
     # set up LP model
-    print('Constructing LP Model')
-    LP = pulp.LpProblem('CPS Stage 2', pulp.LpMinimize)
+    print('Constructing LP Model...')
+    lp = pulp.LpProblem('CPS Stage 2', pulp.LpMinimize)
     r = pulp.LpVariable.dicts('r', data.index, lowBound=0)
     s = pulp.LpVariable.dicts('s', data.index, lowBound=0)
     # add objective function
-    LP += pulp.lpSum([r[i] + s[i] for i in data.index])
-    # add constrains
+    lp += pulp.lpSum([r[i] + s[i] for i in data.index])
+    # add constraints
     for i in data.index:
-        LP += r[i] + s[i] <= tol
+        lp += r[i] + s[i] <= tol
     for i in range(len(b)):
-        LP += pulp.lpSum([(a1[i][j] * r[j] + a2[i][j] * s[j])
+        lp += pulp.lpSum([(a1[i][j] * r[j] + a2[i][j] * s[j])
                           for j in data.index]) == b[i]
     print('Solving Model...')
     pulp.LpSolverDefault.msg = 1
-    LP.solve()
-    print(pulp.LpStatus[LP.status])
+    lp.solve()
+    print(pulp.LpStatus[lp.status])
 
     # apply r and s to the weights
     r_val = np.array([r[i].varValue for i in r])