გამოიყენება მონაცემების ანალიზისა და მანიპულაციისთვის.
from google.colab import drive
drive.mount('/content/drive')
import pandas as pd
df = pd.read_excel("path.xlsx")
import pandas as pd
from google.colab import drive
drive.mount('/content/drive')
file_path = "path.xlsx"
sheet_name = 'sheet_name'
df = pd.read_excel(file_path, sheet_name=sheet_name)
import pandas as pd
import pyodbc
conn = pyodbc.connect('Driver={SQL Server};'
'Server=10.0.0.96\SQL2019;'
'Database=TERA_TEST;'
'Trusted_Connection=yes;')
employee = pd.read_sql_query('SELECT * FROM [TERA_TEST].[dbo].[TERA TEST$Absence Line$437]',conn)
Info
print ("Rows : " ,df.shape[0])
print ("Columns : " ,df.shape[1])
print ("\nFeatures : \n" ,df.columns.tolist())
print ("\nMissing values : ", df.isnull().sum().values.sum())
print ("\nUnique values : \n",df.nunique())
Select Columns
df.get(['Column_A', 'Column_B', 'Column_C'])
df=df.iloc[ : , lambda df : [13 , 5]]
data = df.iloc[:, lambda df: slice(1, 34)]
df=df.iloc[: , 0: 8 ]
select_columns = ['column_A','column_B','column_C']
df= df[select_columns]
df.select_dtypes(include='object')
df.select_dtypes(include=['int64', 'float64'])
Select Rows
df.iloc[[1]]
df[0,1]
df.iloc[:2]
drop
df = df.dropna(how='all') - drop all blank rows
df = df.dropna() --- drop blank if at least one is blank
join
df= pd.merge(df1, df2, left_index=True, right_index=True)
df= pd.merge(df1,df2,on='len',how='inner')
Filter & ContainString
df=df[df['Tran_Date'] < '2022-04-01']
df=df[df['Name'] == “მიხეილი“ ]
df=df[df['Name'].str.contains(“მიხეილი“ )]
df=df[df['SurName'].str.contains(“გუბელიძე“ ) & df['SurName'].str.contains(“მიხეილ“ ) ]
Split And Concatenate
df[['Col_1', 'Col_2']]=df.iloc[:,6].str.split( '-' , expand = True)
df["d"] = df['yy'].astype(str)+ "-" + df["mm"].astype(str)+"-1"
Extract Date
df1['year']= df['TransactionDate'].dt.year
df1['month']= df['TransactionDate'].dt.month
df1['mm']= df['TransactionDate'].dt.month_name()
df['mmyy'] = df['Date'].apply(lambda x: x.strftime('%b-%Y'))
Distinct / CountRows
df[‘year’].unique()
dict(enumerate(df[‘Station’].unique()))
df['columnName'].value_counts(sort = False)
Order Columns /Order By
new_order = [4,1,0,2,3,]
df = df[df.columns[new_order]]
df.sort_values ( by=’date’ ,ascending =False)
Where Condition
df[‘NewCol’] = np.where(df['Code'] == ‘lanch’ ,’ლანჩხუთი’,
np.where(df['Code'] == ‘Chokh’ ,’ჩოხატაური’,
‘ოზურგეთი’ )
If Statement
df[‘age_group’] = df[‘age’].apply(lambda x: ‘[17, 25)’
if x < 25 else ‘[25, 35)’
if x < 35 else ‘[35, 45)’
if x < 65 else ‘65+’)
df['quality'] = df['quality'].apply(lambda x: 1 if x >= 7 else 0)
isnull / filna
df.isnull().sum()
df[df.isnull().any(axis=1)]
df['columnname'].fillna(method='ffill')
df['columnname'].fillna(method='backfill')
df['columnname'].fillna(df['columnname'].median(), inplace=True)
Rename . Rename With Index
new_columns = {'ano' : 'year',estado': 'state'}
df.rename(columns = new_columns, inplace=True)
column_index_to_rename = 0
current_column_name = df.columns[column_index_to_rename]
new_column_name = 'YYMM'
column_ren_dict = {current_column_name: new_column_name}
date_range_default.rename(columns=column_ren_dict, inplace=True)
Group By / Pivot
df=df[['year','month','AMOUNT']].groupby(by=['year','month']).sum()
grouped_df = df.groupby('Category').size().reset_index(name='Count')
result = df.groupby(['date', 'customer']).agg({'Value_column': 'sum', 'Type': 'count'}).rename(columns={'customer': 'Count'}).reset_index()
df.pivot_table(index='col1', columns='col2',values='Quantity',aggfunc='sum')
Rank / Rank Window
df['Rank'] = df['Population'].rank(method='dense', ascending=False)
df['Rank_Mun']=df['Popul'].groupby(df['municipality']).cumcount()
Rename . Rename With Index
new_columns = {'ano' : 'year',estado': 'state'}
df.rename(columns = new_columns, inplace=True)
column_index_to_rename = 0
current_column_name = df.columns[column_index_to_rename]
new_column_name = 'YYMM'
column_ren_dict = {current_column_name: new_column_name}
date_range_default.rename(columns=column_ren_dict, inplace=True)
IF Statement
df['quality'] = [1 if x > 5 else 0 for x in df.quality]
df['type'] = [1 if x == "white" else 0 for x in df.type]
DataFrame/ Len / Reshape/Convert
X = pd.DataFrame(X)con
df['len_dscrpt'] = df['Desc_2'].apply(lambda x: len(str(x)))
x_train_reshaped = np.array(x_train).reshape(-1, 1)
df['date']=pd.to_datetime(df['date'])
df['date'] = df['date'].dt.date
df = df.convert_dtypes()
df['namr'] = pd.to_numeric(df['name'], errors='coerce')
df = df.astype({'Fee':'int'})
df['Fee'] = df['Fee'].astype('int')
Download
from google.colab import files
file_name = 'guria.xlsx'
guria.to_excel(file_name, index=False)
files.download(file_name)
from google.colab import files
df.to_excel('df.xlsx',index=False , encoding='utf-8')
files.download('act_prd.xlsx')