# transform.py
from datetime import date
from html import unescape

import pandas as pd
from bs4 import BeautifulSoup


class Transform:

    def __init__(self, df, config):
        self.df = df
        self.config = config

    def save(self, output_path):
        """
        Description: save the cleaned df into csv
        args:
            output_path: path to output.csv
        """
        self.df.to_csv(output_path, index=False)

    def html_format_remover(self):
        """
        Description: remove html entities/character references
        in COLUMNS_TO_REMOVE_HTML(config) columns
        """
        for col in self.config["COLUMNS_REMOVE_HTML"]:
            self.df[col] = (
                self.df[col].apply(
                    lambda x: BeautifulSoup(
                        unescape(x), "html.parser"
                    ).text if pd.notnull(x) else x
                )
            )
        return self.df

    def reformat_string(self):
        """
        Description: convert to lowercase and remove all special
        characters in all COLUMN_STR_TYPES(config) columns;
        remove digits in product_name and product_description
        """
        for col in self.config["COLUMN_STR_TYPES"]:
            self.df[col] = self.df[col].astype(str)
            self.df[col] = self.df[col].apply(
                lambda x: x.lower()
            )
            if not col == "sku":
                self.df[col] = self.df[col].str.replace(
                    r"[^\w\s]|_", "", regex=True
                )
            if col == "product_name" or col == "product_description":
                self.df[col] = self.df[col].replace(
                    r'[\d]', '', regex=True
                )
        return self.df

    def keep_unit_price_qyt_more_than_zero(self):
        """
        Description: Keep rows that have
        COLUMNS_KEEP_MORETHANZERO(config) greater than 0
        """
        for col in self.config["COLUMNS_KEEP_MORETHANZERO"]:
            self.df[col].fillna(0, inplace=True)
            self.df = self.df[self.df[col] > 0]
        return self.df

    def age_group(self):
        """
        Description: add age_group column that is derived from
        COLUMN_CONVERT_AGE(config) column
        """
        year = date.today().year
        yob = self.config["COLUMN_CONVERT_AGE"][0]
        self.df['age'] = self.df[yob].apply(
            lambda x: x if (pd.isnull(x) or x == '') else year - int(x)
        )
        self.df['age'] = pd.to_numeric(self.df['age'])
        self.df['age_group'] = pd.cut(
            self.df['age'],
            bins=self.config["AGE_BINS"],
            labels=self.config["AGE_LABELS"],
            right=False,
            include_lowest=True
        )
        self.df = self.df.drop(columns=['age'])
        return self.df

    def region(self):
        """
        Description: add region column that is derived from
        COLUMN_CONVERT_POSTAL(config) column
        """
        postal_sector = self.config["COLUMN_CONVERT_POSTAL"][0]
        self.df[postal_sector] = self.df[postal_sector].astype(str)
        self.df['region'] = self.df[postal_sector].apply(
            lambda x: x[:2]
            if not ((pd.isnull(x) or x == '') and len(x) == 2)
            else (
                x[:1]
                if not ((pd.isnull(x) or x == '') and len(x) == 1)
                else ''
            )
        )
        DICT_REGION = {
            'north-east': [
                "53", "54", "55", "56", "57", "79", "80", "82"
            ],
            'north': [
                "69", "70", "71", "72", "73", "75", "76"
            ],
            'central': [
                "1", "2", "3", "4", "5", "6", "7", "8", "9",
                "01", "02", "03", "04", "05", "06", "07", "08", "09",
                "10", "14", "15", "16", "17", "18", "19", "20", "21",
                "22", "23", "24", "25", "26", "27", "28", "29", "30",
                "31", "32", "33", "34", "35", "36", "37", "38", "39",
                "40", "41", "58", "59", "77", "78"
            ],
            'west': [
                "11", "12", "13", "60", "61", "62", "63", "64",
                "65", "66", "67", "68"
            ],
            'east': [
                "42", "43", "44", "45", "46", "47", "48", "49",
                "50", "51", "52", "81"
            ]
        }
        mapping = {}
        for key, values in DICT_REGION.items():
            for item in values:
                mapping[item] = key
        self.df['region'] = self.df['region'].replace(mapping)
        self.df['region'] = self.df['region'].apply(
            lambda x: x if (
                x == 'north-east' or x == 'north' or x == 'central'
                or x == 'west' or x == 'east'
            ) else ""
        )
        return self.df

    def remove_spacing_from_sku(self):
        """
        Description: substitute COLUMN_SKU(config) spacing with "_";
        substitute COLUMN_SKU(config) starting with 0 with string "A"
        """
        sku = self.config["COLUMN_SKU"][0]
        self.df[sku] = self.df[sku].replace(
            r'[ ]', '_', regex=True
        )
        self.df[sku] = self.df[sku].apply(
            lambda x: "A" + x[1:] if x[0] == "0" else x
        )
        return self.df

    def combine_storeID_outlet_to_SKU(self):
        """
        Description: Appends COLUMN_STOREID(config) + COLUMN_OUTLET(config)
        to COLUMN_SKU(config) if same COLUMN_SKU(config), different stores
        """
        sku = self.config["COLUMN_SKU"][0]
        sid = self.config["COLUMN_STOREID"][0]
        outlet = self.config["COLUMN_OUTLET"][0]
        self.df[sku] = (
            self.df[sid] + "_" + self.df[outlet] + "_" + self.df[sku]
        )
        return self.df

    def combine_storeID_outlet_to_customerID(self):
        """
        Description: Appends COLUMN_STOREID(config) + COLUMN_OUTLET(config)
        to COLUMN_CUSTID(config) if same COLUMN_CUSTID(config), different stores
        """
        cid = self.config["COLUMN_CUSTID"][0]
        sid = self.config["COLUMN_STOREID"][0]
        outlet = self.config["COLUMN_OUTLET"][0]
        self.df[cid] = (
            self.df[sid].astype(str) + "_"
            + self.df[outlet].astype(str) + "_"
            + self.df[cid].astype(str)
        )
        return self.df

    def remove_null_compulsory_data(self):
        """
        Description: Removes rows where data is null but schema does not
        allow nullable and is compulsory as per COLUMNS_NULL_COMPULSORY(config)
        """
        for col in self.config["COLUMNS_NULL_COMPULSORY"]:
            self.df.drop(
                self.df[
                    self.df[col].map(len) < 1
                ].index,
                inplace=True
            )
            self.df.drop(
                self.df[
                    self.df[col] == "nan"
                ].index,
                inplace=True
            )
            self.df.drop(
                self.df[
                    self.df[col] == " "
                ].index,
                inplace=True
            )
        return self.df
Hi All,
I'm getting a warning when I run this ETL code. Can anyone take a look and suggest a fix?
Thanks in advance,
karthik
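
For reference, a minimal driver for the class would look roughly like the sketch below. The config keys are the ones the methods above read, but the file paths, column names, bins, and labels shown are only assumed placeholders, not the real values.

import pandas as pd

from transform import Transform

# Placeholder config: keys match what Transform reads; values are assumptions.
config = {
    "COLUMNS_REMOVE_HTML": ["product_description"],
    "COLUMN_STR_TYPES": ["sku", "product_name", "product_description"],
    "COLUMNS_KEEP_MORETHANZERO": ["unit_price", "quantity"],
    "COLUMN_CONVERT_AGE": ["year_of_birth"],
    "AGE_BINS": [0, 20, 30, 40, 50, 60, 120],
    "AGE_LABELS": ["<20", "20s", "30s", "40s", "50s", "60+"],
    "COLUMN_CONVERT_POSTAL": ["postal_sector"],
    "COLUMN_SKU": ["sku"],
    "COLUMN_STOREID": ["store_id"],
    "COLUMN_OUTLET": ["outlet"],
    "COLUMN_CUSTID": ["customer_id"],
    "COLUMNS_NULL_COMPULSORY": ["sku", "customer_id"],
}

df = pd.read_csv("input.csv")  # placeholder input path
t = Transform(df, config)
t.html_format_remover()
t.reformat_string()
t.keep_unit_price_qyt_more_than_zero()
t.age_group()
t.region()
t.remove_spacing_from_sku()
t.combine_storeID_outlet_to_SKU()
t.combine_storeID_outlet_to_customerID()
t.remove_null_compulsory_data()
t.save("output.csv")  # placeholder output path

Without the exact warning text it is hard to say which warning is being raised, but if it happens to be a pandas FutureWarning about chained assignment, a common trigger in code like this is self.df[col].fillna(0, inplace=True) on a selected column; assigning the result back with self.df[col] = self.df[col].fillna(0) avoids it.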