Source code for renewenergy.clean_data

import click
from io import BytesIO
import pandas as pd
import os
from zipfile import ZipFile
from urllib.request import urlopen
from sklearn.model_selection import train_test_split
import numpy as np
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from renewenergy.impute_split import impute_split


[docs]
def clean_data(dataread,dataout,datafile1, datafile2, seed):
  """
    Perform all cleaning steps on the dataset 

    Parameters
    ----------
    dataread: str
        Path to dataset
    
    dataout: str
        Path to save training and testing datasets to. 
    
    datafile1: str
        Name of CSV file to save test data to. 
    
    datafile2: str
        Name of CSV file to save training data to. 

    seed: int
        Used to allow for reproduceability of results.
    
    
    Returns
    -------
    training.csv
        CSV containing the training data 
    
    test.csv
        CSV containing the test data 
      

    Examples
    --------
    >>> plot_rmse("data/energy_train.csv", "data/energy_test.csv", "results/" )
    
  """
  np.random.seed(seed)
  data1 = pd.read_csv(dataread)
  data1=data1.pivot_table(index= 'Country Name', values="2015", columns='Indicator Name')
  data1= data1[['Access to electricity (% of population)', 'Adjusted net national income (constant 2015 US$)', 
             'CO2 emissions (kt)', 'Death rate, crude (per 1,000 people)',
              'Land area (sq. km)', 'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
                'Population, total','Renewable energy consumption (% of total final energy consumption)',
             'Renewable electricity output (% of total electricity output)']]
  energy_train, energy_test = impute_split(data1, 0, 0.75, seed)
  os.makedirs(dataout, exist_ok=True)  
  energy_test.to_csv(dataout+"/"+datafile1)  
  energy_train.to_csv(dataout+ "/"+datafile2)