-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFPDS Atom feed data pull.py
122 lines (106 loc) · 5.62 KB
/
FPDS Atom feed data pull.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 14 18:44:49 2020
@author: CatsAndProcurement
The purpose of the script is to
download a user-defined dataset from the
Federal Procurement Data System (FPDS)
and aggregate and display the data using Pandas.
FPDS is a publicly accessible US government data system
for prime contract award information.
Sample FPDS web API call:
https://www.fpds.gov/ezsearch/fpdsportal?s=FPDS&indexName=awardfull&templateName=CSV&q=++CONTRACT_FISCAL_YEAR%3A%222019%22+DEPARTMENT_ID%3A%224700%22+PRODUCT_OR_SERVICE_CODE%3A%22D3*%22
"""
# Pandas lets us do fancy calculations:
import pandas as pd
# We'll use this to write a paragraph later:
writePara = ""
# The following are parameters we'll use for the FPDS API call
# '4700' is the API call code for the General Services Administration (GSA):
callDept = "4700"
# 2019 is the fiscal year that we'll pull data for:
callFY = "2019"
# Product and Service Codes (PSCs) tell us what the government is buying:
askPSC = input("Hi! This Python script provides info on General Services"+
"Administration (GSA) contracting!"+"\n"+
"Please enter a 2-digit Product Service Code (PSC)."+"\n"+
"Suggestions include 'D3' for IT services,"+
"'S2' for Housekeeping services,"+
"see here for others:"+"\n"+
"https://www.acquisition.gov/PSC_Manual"+"\n")
# FPDS uses asterisks as wildcards, so the following lets us pull all PSC data:
callPSC = askPSC + "*"
# The following assembles our API call as a string:
callURL = ("https://www.fpds.gov/ezsearch/fpdsportal?s=FPDS&indexName=awardfull&templateName=CSV&q=++"+
"CONTRACT_FISCAL_YEAR%3A%22"+callFY+
"%22+DEPARTMENT_ID%3A%22"+callDept+
"%22+PRODUCT_OR_SERVICE_CODE%3A%22"+callPSC+
"%22")
# The following lets the user know the program is pulling their data:
print("\n"+
"Now accessing:"+"\n"+
callURL+
"\n")
# The following extracts the data from the FPDS web site as a CSV file:
dfFPDS = pd.read_csv(callURL,encoding="latin-1")
# A contract 'obligation' is the point at which a US govt contracting officer
# agrees that the USA will pay money on a contract
# The following cleans up potentially error-causing contract obligation data:
dfFPDS["Action Obligation ($)"] = dfFPDS["Action Obligation ($)"].str.replace("$","")
dfFPDS["Action Obligation ($)"] = dfFPDS["Action Obligation ($)"].str.replace(",","")
dfFPDS["Action Obligation ($)"] = dfFPDS["Action Obligation ($)"].astype(float)
# The following modifies PSC data to make it more readable:
dfFPDS["PSC Description"] = dfFPDS["PSC Description"].str.lower()
# The following creates a new variable with both PSC and a description of
# what that PSC means:
dfFPDS["Code and Description"] = (dfFPDS["PSC"].apply(str)+" ("+dfFPDS["PSC Description"]+")")
# The following deletes hyphens and capitalizes the phrases 'IT' & 'R&D' if needed:
dfFPDS["Code and Description"] = dfFPDS["Code and Description"].str.replace("- "," ")
dfFPDS["Code and Description"] = dfFPDS["Code and Description"].str.replace("it ","IT ")
dfFPDS["Code and Description"] = dfFPDS["Code and Description"].str.replace("r&d ","R&D ")
# The following uses Pandas to create a pivot table and add up the obligations
# based on our new 'Code and Description' variable:
dfPivot = dfFPDS.pivot_table(index=["Code and Description"],
values=["Action Obligation ($)"],
aggfunc=sum,
fill_value=0,
margins="true")
# The following sorts the data high to low:
dfPivot = dfPivot.reindex(dfPivot["Action Obligation ($)"].sort_values(ascending=False).index).reset_index()
# The following adds an introduction sentence to our paragraph:
writePara=(writePara+
"The following data represents real-time information on US General Services "+
"Administration (GSA) contract obligations for fiscal year "+callFY+
" in Product Service Code (PSC) category "+askPSC+". ")
# The following figures out how many times we ought to loop through the pivot
# table to write sentences:
numPSCs = len(dfPivot.index)-1
iCounter = min(numPSCs-1,5)
# The following reads the top few aggregated values in the pivot table,
# also finds their PSC code/descriptions, puts them in sentences, and
# puts the sentences in a paragraph:
for i in range(1,iCounter):
writeSent=("GSA obligated "+
"${:,.0f}".format(round(dfPivot.iloc[i,1]))+
" on contracts coded under PSC "+
dfPivot.iloc[i,0]+".")
writePara=writePara+writeSent
i=i+1
# The following adds a final sentence to the paragraph, letting the user know
# how much the agency obligated in total for all PSCs in the dataset:
writeSent = ("In total, GSA obligated "+
"${:,.0f}".format(round(dfPivot.iloc[0,1]))+
" on contracts listed under PSC category "+askPSC+".")
# The following adds that last sentence to the paragraph:
writePara=writePara+writeSent
# The following prints the paragraph:
print(writePara)
# Now we'll create a basic chart in Pandas
# The following deletes the first row of the dataframe (the sum total):
dfPivot=dfPivot.drop(dfPivot.index[0])
# And the following displays the entire pivot table, minus that first row
# we deleted, as a bar graph:
dfPivot.plot.bar(x="Code and Description",
y="Action Obligation ($)",
title=("GSA contract obligations in PSC category "+askPSC+" for fiscal year "+callFY),
figsize=(10,6))