In: Computer Science
Submit a processed dataset and the Python or SAS script that was used to produce it, along with a short description of the steps you followed.
Link for the unprocessed dataset: https://drive.google.com/file/d/1HQALy5DdsuT8jBNUNQd8Eq0KHgR3JnyG/view?usp=sharing
#python script for processing the data
import pandas as pd
import numpy as np
from functools import reduce
#Reading the dataset using pandas
# A forward slash is accepted on Windows as well as POSIX systems and avoids
# the invalid '\B' escape sequence that the backslash form of this path had.
df = pd.read_csv('Datasets/BL-Flickr-Images-Book.csv')
# Preview the first rows; a bare df.head() displays nothing when this file is
# run as a script rather than in a notebook.
print(df.head())
#checking to see how many rows contain null values
# print() calls are used because Python 2 print statements are a SyntaxError
# on Python 3; a single-argument print() behaves the same on both.
print(df['Edition Statement'])
print(df['Edition Statement'].isnull())
# The actual number of rows whose 'Edition Statement' is null.
print(df['Edition Statement'].isnull().sum())
# Remove, in place, the columns that hold no useful information.
to_drop = [
    'Edition Statement',
    'Corporate Author',
    'Corporate Contributors',
    'Former owner',
    'Engraver',
    'Contributors',
    'Issuance type',
    'Shelfmarks',
]
df.drop(columns=to_drop, inplace=True)
# We can use the DataFrame.info() method to give us some high-level
# information about our dataframe, including its size, information about
# data types and memory usage.
# (Every line of this note must start with '#'; the wrapped lines previously
# did not, which made the script a syntax error.)
df.info(memory_usage='deep')
# Use the 'Identifier' column as the row index of each record instead of the
# default serial number.
df = df.set_index('Identifier')
#Cleaning columns using the .apply function
#cleaning the data
#removing unwanted characters from the date of publication column
# A publication date is cut off at the first occurrence of any of these.
unwanted_characters = ['[', ',', '-']


def clean_dates(item):
    """Return the cleaned 'Date of Publication' value of one row.

    Parameters:
        item: a row object whose ``.loc['Date of Publication']`` yields the
            raw date value (this function is applied row-wise).

    Returns:
        ``np.nan`` when the value is missing, empty, or starts with '['.
        Otherwise the text before the first unwanted character, with
        surrounding whitespace removed (e.g. '1879 [1878]' -> '1879').
    """
    # str() turns a missing value (float NaN) into the text 'nan'.
    dop = str(item.loc['Date of Publication']).strip()
    # startswith() is safe on an empty string, unlike indexing dop[0].
    if not dop or dop == 'nan' or dop.startswith('['):
        # np.nan is the supported spelling; the np.NaN alias was removed in
        # NumPy 2.0.
        return np.nan
    for character in unwanted_characters:
        character_index = dop.find(character)
        if character_index != -1:
            dop = dop[:character_index]
    # Truncating at '[' or ',' can leave trailing whitespace behind.
    return dop.strip()
# Run clean_dates over every row and store the result as the date column.
df['Date of Publication'] = df.apply(clean_dates, axis='columns')
#Cleaning the title column
def clean_title(title):
    """Return a tidied, word-capitalised version of one book title.

    Steps, in order:
      1. A non-string value (such as a float NaN for a missing title) is
         returned unchanged; the literal string 'nan' becomes 'NaN'.
      2. A title starting with '[' is reduced to the text inside that first
         bracket pair (or to everything after '[' if there is no ']').
      3. Everything from the first standalone word 'by' (or, when there is
         none, 'By') onwards is dropped.
      4. Everything from the first remaining '[' onwards is dropped.
      5. Trailing spaces and '.', ',', ';', ':' characters are stripped.
      6. Each whitespace-separated word is passed through str.capitalize
         (which also lower-cases the rest of the word) and the words are
         re-joined with single spaces.

    Parameters:
        title: the raw title value.

    Returns:
        The cleaned title string, or the input itself when it is not a
        string.
    """
    # Local import: the file's import block does not provide `re`.
    import re

    # Indexing or searching a float NaN would raise TypeError.
    if not isinstance(title, str):
        return title
    if title == 'nan':
        return 'NaN'

    if title.startswith('['):
        closing = title.find(']')
        # find() returns -1 when ']' is absent; slicing with -1 would
        # silently drop the last character instead.
        title = title[1:closing] if closing != -1 else title[1:]

    # Word boundaries keep words that merely contain "by" (e.g.
    # "Derbyshire") intact. Lower-case 'by' is tried before 'By'.
    by_match = re.search(r'\bby\b', title) or re.search(r'\bBy\b', title)
    if by_match:
        title = title[:by_match.start()]

    bracket = title.find('[')
    if bracket != -1:
        title = title[:bracket]

    # Strip only trailing separators; a fixed title[:-2] slice also removed
    # two real characters from titles with no trailing punctuation.
    title = title.rstrip(' .,;:')

    return ' '.join(word.capitalize() for word in title.split())
# Clean every entry of the 'Title' column and write the result back.
cleaned_titles = df['Title'].apply(clean_title)
df['Title'] = cleaned_titles
#saving the processed dataframe into a csv
# index=True writes the dataframe's index (the 'Identifier' values) as the
# first column; the previous index=None is falsy and left it out of the file.
# to_csv() returns None when given a path, so export_csv is None; the name is
# kept only so that existing references to it keep working.
export_csv = df.to_csv('processed_dataset.csv', index=True, header=True)
Processed dataset: https://drive.google.com/file/d/1h10lnhShmMIquQl-MN25KrnmKPdwCak2/view?usp=sharing