import click
from io import BytesIO
import pandas as pd
import os
from zipfile import ZipFile
from urllib.request import urlopen
from sklearn.model_selection import train_test_split
import numpy as np
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from renewenergy.impute_split import impute_split
# [docs]
def clean_data(dataread, dataout, datafile1, datafile2, seed):
    """
    Clean the raw indicator dataset and save train/test splits as CSV files.

    The raw CSV is pivoted so that each row is a country and each column is
    an indicator, taking the values from the "2015" column (duplicate
    country/indicator pairs are aggregated by ``pivot_table``'s default
    aggregation). Only a fixed set of indicators is kept, the result is
    passed to ``impute_split``, and the two returned datasets are written
    into ``dataout``.

    Parameters
    ----------
    dataread : str
        Path to the raw CSV dataset. It must contain the columns
        "Country Name", "Indicator Name" and "2015".
    dataout : str
        Directory to save the training and testing datasets to. It is
        created if it does not already exist.
    datafile1 : str
        Name of the CSV file to save the test data to.
    datafile2 : str
        Name of the CSV file to save the training data to.
    seed : int
        Used to allow for reproducibility of results. It seeds NumPy's
        global random state and is also forwarded to ``impute_split``.

    Returns
    -------
    None
        Nothing is returned; the test data is written to
        ``dataout/datafile1`` and the training data to ``dataout/datafile2``.

    Raises
    ------
    KeyError
        If a required column or one of the selected indicators is missing
        from the dataset.

    Examples
    --------
    >>> clean_data("data/raw_data.csv", "data", "energy_test.csv",
    ...            "energy_train.csv", 123)  # doctest: +SKIP
    """
    # Indicators kept for the analysis; the names must match the values of
    # the "Indicator Name" column exactly.
    indicators = [
        'Access to electricity (% of population)',
        'Adjusted net national income (constant 2015 US$)',
        'CO2 emissions (kt)',
        'Death rate, crude (per 1,000 people)',
        'Land area (sq. km)',
        'PM2.5 air pollution, mean annual exposure (micrograms per cubic meter)',
        'Population, total',
        'Renewable energy consumption (% of total final energy consumption)',
        'Renewable electricity output (% of total electricity output)',
    ]

    # Seed the global NumPy RNG in case downstream code draws from it.
    np.random.seed(seed)

    raw = pd.read_csv(dataread)
    # Long -> wide: one row per country, one column per indicator.
    wide = raw.pivot_table(
        index='Country Name', values="2015", columns='Indicator Name'
    )
    selected = wide[indicators]

    # NOTE(review): 0 and 0.75 are passed through unchanged; 0.75 is
    # presumably the training proportion -- confirm against impute_split.
    energy_train, energy_test = impute_split(selected, 0, 0.75, seed)

    os.makedirs(dataout, exist_ok=True)
    # os.path.join avoids hand-built separators (e.g. a doubled "/" when
    # dataout already ends with one).
    energy_test.to_csv(os.path.join(dataout, datafile1))
    energy_train.to_csv(os.path.join(dataout, datafile2))