Karthik K

Karthik K

  • 1.3k
  • 479
  • 50.1k

How to fix Unresolved import 'src.transform' in python ?

Sep 6 2023 2:53 AM
# transform.py

from datetime import date
from html import unescape

import pandas as pd
from bs4 import BeautifulSoup

class Transform:
    def __init__(self, df, config):
        self.df = df
        self.config = config

    def save(self, output_path):
        """
        Description:
            save the cleaned df into csv
        args:
            output_path: path to output.csv
        """
        self.df.to_csv(output_path, index=False)

    def html_format_remover(self):
        """
        Description:
            remove html entities/character references in COLUMNS_TO_REMOVE_HTML(config) columns
        """
        for col in self.config["COLUMNS_REMOVE_HTML"]:
            self.df[col] = (
                      self.df[col].apply(
                          lambda x: BeautifulSoup(
                              unescape(x), "html.parser"
                          ).text
                          if pd.notnull(x)
                          else x
                      )
                )
        return self.df

    def reformat_string(self):
        """
        Description:
            convert to lowercase and remove all special characters in all COLUMN_STR_TYPES(config) columns
            remove digits in product_name and product_description
        """
        for col in self.config["COLUMN_STR_TYPES"]:
            self.df[col] = self.df[col].astype(str)

            self.df[col] = self.df[col].apply(
                lambda x: x.lower()
            )

            if not col == "sku":
                self.df[col] = self.df[col].str.replace(
                        r"[^\w\s]|_",
                        "",
                        regex=True
                )

            if col == "product_name" or col == "product_description":
                self.df[col] = self.df[col].replace(
                        r'[\d]',
                        '',
                        regex=True
                    )
        return self.df

    def keep_unit_price_qyt_more_than_zero(self):
        """
        Description:
            Keep rows that have COLUMNS_KEEP_MORETHANZERO(config) greater than 0
        """
        for col in self.config["COLUMNS_KEEP_MORETHANZERO"]:
            self.df[col].fillna(0, inplace=True)
            self.df = self.df[self.df[col] > 0]
        return self.df

    def age_group(self):
        """
        Description:
            add age_group column that is derived from COLUMN_CONVERT_AGE(config) column
        """
        year = date.today().year

        yob = self.config["COLUMN_CONVERT_AGE"][0]
        self.df['age'] = self.df[yob].apply(
            lambda x: x
            if (pd.isnull(x) or x == '')
            else year-int(x)
        )
        self.df['age'] = pd.to_numeric(self.df['age'])
        self.df['age_group'] = pd.cut(
            self.df['age'],
            bins=self.config["AGE_BINS"],
            labels=self.config["AGE_LABELS"],
            right=False,
            include_lowest=True
        )
        self.df = self.df.drop(columns=['age'])
        return self.df

    def region(self):
        """
        Description:
            add region column that is derived from COLUMN_CONVERT_POSTAL(config) column
        """
        postal_sector = self.config["COLUMN_CONVERT_POSTAL"][0]
        self.df[postal_sector] = self.df[postal_sector].astype(str)
        self.df['region'] = self.df[postal_sector].apply(
            lambda x: x[:2]
            if not ((pd.isnull(x) or x == '') and len(x) == 2)
            else (x[:1]
                  if not ((pd.isnull(x) or x == '') and len(x) == 1)
                  else ''
                  )
        )
        DICT_REGION = {
            'north-east': [
                "53", "54", "55", "56", "57",
                "79",
                "80", "82"
            ],
            'north': [
                "69",
                "70", "71", "72", "73", "75", "76"
            ],
            'central': [
                "1", "2", "3", "4", "5", "6", "7", "8", "9",
                "01", "02", "03", "04", "05", "06", "07", "08", "09", "10",
                "14", "15", "16", "17", "18", "19",
                "20", "21", "22", "23", "24", "25", "26", "27", "28", "29",
                "30", "31", "32", "33", "34", "35", "36", "37", "38", "39",
                "40", "41",
                "58", "59",
                "77", "78"
            ],
            'west': [
                "11", "12", "13",
                "60", "61", "62", "63", "64", "65", "66", "67", "68"
            ],
            'east': [
                "42", "43", "44", "45", "46", "47", "48", "49",
                "50", "51", "52",
                "81"
            ]
        }
        mapping = {}
        for key, values in DICT_REGION.items():
            for item in values:
                mapping[item] = key
        self.df['region'] = self.df['region'].replace(mapping)
        self.df['region'] = self.df['region'].apply(
            lambda x: x
            if (
                    x == 'north-east' or
                    x == 'north' or
                    x == 'central' or
                    x == 'west' or
                    x == 'east'
            )
            else ""
        )
        return self.df

    def remove_spacing_from_sku(self):
        """
        Description:
            substitute COLUMN_SKU(config) spacing with "_"
            substitute COLUMN_SKU(config) starting with 0 with string "A"
        """
        sku = self.config["COLUMN_SKU"][0]
        self.df[sku] = self.df[sku].replace(
            r'[ ]',
            '_',
            regex=True
        )
        self.df[sku] = self.df[sku].apply(
            lambda x: "A" + x[1:]
            if x[0] == "0"
            else x
        )
        return self.df

    def combine_storeID_outlet_to_SKU(self):
        """
        Description:
            Appends COLUMN_STOREID(config) + COLUMN_OUTLET(config) to COLUMN_SKU(config) if same COLUMN_SKU(config), different stores
        """
        sku = self.config["COLUMN_SKU"][0]
        sid = self.config["COLUMN_STOREID"][0]
        outlet = self.config["COLUMN_OUTLET"][0]
        self.df[sku] = self.df[sid] + "_" + self.df[outlet] + "_" + self.df[sku]
        return self.df

    def combine_storeID_outlet_to_customerID(self):
        """
        Description:
            Appends COLUMN_STOREID(config) + COLUMN_OUTLET(config) to COLUMN_CUSTID(config) if same COLUMN_CUSTID(config), different stores
        """
        cid = self.config["COLUMN_CUSTID"][0]
        sid = self.config["COLUMN_STOREID"][0]
        outlet = self.config["COLUMN_OUTLET"][0]
        self.df[cid] = self.df[sid].astype(str) + "_" + self.df[outlet].astype(str)  + "_" + self.df[cid].astype(str)
        return self.df

    def remove_null_compulsory_data(self):
        """
        Description:
            Removes rows where data is null but schema does not allow nullable
            and is compulsory as per COLUMNS_NULL_COMPULSORY(config)
        """
        for col in self.config["COLUMNS_NULL_COMPULSORY"]:
            self.df.drop(
                self.df[
                    self.df[col].map(len) < 1
                ].index,
                inplace=True
            )

            self.df.drop(
                self.df[
                    self.df[col] == "nan"
                ].index,
                inplace=True
            )
            self.df.drop(
                self.df[
                    self.df[col] == " "
                ].index,
                inplace=True
            )
        return self.df

Hi All , 

I having warning in this code . can anyone check and can give some solution for ETL code. 

Thanks in advance 

karthik


Answers (1)