Skip to content

Latest commit

 

History

History
101 lines (67 loc) · 2.79 KB

pandas.md

File metadata and controls

101 lines (67 loc) · 2.79 KB

Pandas Basics

 # read a csv file or excel
 df = pd.read_csv('data/survey_results_public.csv', delimiter=',')
 df = pd.read_excel('data/survey_results_public.xls')

 # structure of data in dataframe
 people = {
 "column1" : ["value1", "value2", "value3"],
 "column2" : ["value4", "value5", "value6"],
 "column3" : ["value7", "value8", "value9"]
 }

 # create dataframe from python dictionary
 df = pd.DataFrame(people) 

 # get the shape of the file
 df.shape

 # set option (set_option is a top-level pandas function, not a DataFrame method)
 pd.set_option('display.max_columns', 89)

 # get top 10 or last 10 rows
 df.head(10),  df.tail(10)


 # get a Series from the dataframe (a Series has more functionality compared to a DataFrame)
 df['column1'] or df.column1 or df['column1'][0:5]


 # selecting multiple columns (returns the filtered dataframe )
 df[['column1', 'column2']]

 # get the dataframe row(returns series) search by integer location
 df.iloc[0]  

 #example array1=rowsIndex, array2=columnIndex
 df.iloc[[0, 1], 2] or df.iloc[[0, 1], [2,3]]

 # iteration through rows
 for index, row in df.iterrows():
     print(index, row)

 
 # get high level stats on the frame (mean, sd)
 df.describe()
 
 # sort by column
 df.sort_values('Column1', ascending=False)
 df.sort_values(['Column1', 'Column2'], ascending=[1, 0])
 
 # row wise summation (axis=0 => vertically, axis=1 => horizontally)
 df['total'] = df['column1'] + df['column2']+df['column3']
 df['total'] = df.iloc[:, c1:c4].sum(axis=1)
 
 # drop a column
 df = df.drop(columns=['total'])
 
 
 
 #  *******: Filtering Data :********** (Loc finding specific data in dataframe)
 df.loc[df["column1"] == "value1"]
 df.loc[ (df["column1"] == "value1") &  (df["column2"] > "value2") ] 
 df.loc[ (df["column1"] == "value1") |  (df["column2"] == "value2") ] 
 df.loc[ ~df["column1"].str.contains('substring')] # gets all rows whose column1 does not contain the substring
 
 # gets all rows whose column1 does not match the regex (case-insensitive)
 df.loc[ ~df["column1"].str.contains('sub1|sub2', flags=re.I, regex=True)] 
 
 
 # Conditional Changes
 df.loc[df["column1"] == 'val1', "column1"] = "new_val1"
 df.loc[df["Total"] > 500, ["Generation", "Legendary"]] = ["val1", "val2"]
 
 # Groupby (Aggregation)
 df.groupby(['column1']).mean().sort_values('Col', ascending=False)
 df.groupby(['column1']).sum()
 
 df['count'] = 1
 df.groupby(['column1', 'column2']).count()['count']
 
   
 #Reset Index
 df.reset_index(drop=True, inplace=True)
 
 # save df to directory
 df.to_csv('path', index=False)
 df.to_excel('path', index=False)
 df.to_csv('path', index=False, sep='\t')
 
 # work with large amount of data
 new_df = pd.DataFrame(columns=df.columns)
 
 for df in pd.read_csv('filename.csv', chunksize=100000):
   results = df.groupby(['Columns1']).count()
   
   new_df = pd.concat([new_df, results])