Data Cleaning🧹

Load the Packages

#  Load Required Packages
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
library(rnaturalearth)

Import the Data

#  Import the Raw Olympics Dataset
olympics_dataset <- read_csv('data/olympics_dataset.csv')
Rows: 252565 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (9): Name, Sex, Team, NOC, Season, City, Sport, Event, Medal
dbl (2): player_id, Year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the Data

#  Explore the Structure of the Data
head(olympics_dataset)    # Preview the first few rows
# A tibble: 6 × 11
  player_id Name          Sex   Team  NOC    Year Season City  Sport Event Medal
      <dbl> <chr>         <chr> <chr> <chr> <dbl> <chr>  <chr> <chr> <chr> <chr>
1         0 A Dijiang     M     China CHN    1992 Summer Barc… Bask… Bask… No m…
2         1 A Lamusi      M     China CHN    2012 Summer Lond… Judo  Judo… No m…
3         2 Gunnar Aaby   M     Denm… DEN    1920 Summer Antw… Foot… Foot… No m…
4         3 Edgar Aabye   M     Denm… DEN    1900 Summer Paris Tug-… Tug-… Gold 
5        26 Cornelia (-s… F     Neth… NED    1932 Summer Los … Athl… Athl… No m…
6        27 Cornelia (-s… F     Neth… NED    1932 Summer Los … Athl… Athl… No m…
str(olympics_dataset)     # Check the structure of each column
spc_tbl_ [252,565 × 11] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
 $ player_id: num [1:252565] 0 1 2 3 26 27 29 31 32 33 ...
 $ Name     : chr [1:252565] "A Dijiang" "A Lamusi" "Gunnar Aaby" "Edgar Aabye" ...
 $ Sex      : chr [1:252565] "M" "M" "M" "M" ...
 $ Team     : chr [1:252565] "China" "China" "Denmark" "Denmark/Sweden" ...
 $ NOC      : chr [1:252565] "CHN" "CHN" "DEN" "DEN" ...
 $ Year     : num [1:252565] 1992 2012 1920 1900 1932 ...
 $ Season   : chr [1:252565] "Summer" "Summer" "Summer" "Summer" ...
 $ City     : chr [1:252565] "Barcelona" "London" "Antwerpen" "Paris" ...
 $ Sport    : chr [1:252565] "Basketball" "Judo" "Football" "Tug-Of-War" ...
 $ Event    : chr [1:252565] "Basketball Men's Basketball" "Judo Men's Extra-Lightweight" "Football Men's Football" "Tug-Of-War Men's Tug-Of-War" ...
 $ Medal    : chr [1:252565] "No medal" "No medal" "No medal" "Gold" ...
 - attr(*, "spec")=
  .. cols(
  ..   player_id = col_double(),
  ..   Name = col_character(),
  ..   Sex = col_character(),
  ..   Team = col_character(),
  ..   NOC = col_character(),
  ..   Year = col_double(),
  ..   Season = col_character(),
  ..   City = col_character(),
  ..   Sport = col_character(),
  ..   Event = col_character(),
  ..   Medal = col_character()
  .. )
 - attr(*, "problems")=<externalptr> 
glimpse(olympics_dataset)  # Quick overview of the dataset
Rows: 252,565
Columns: 11
$ player_id <dbl> 0, 1, 2, 3, 26, 27, 29, 31, 32, 33, 35, 36, 37, 38, 39, 41, …
$ Name      <chr> "A Dijiang", "A Lamusi", "Gunnar Aaby", "Edgar Aabye", "Corn…
$ Sex       <chr> "M", "M", "M", "M", "F", "F", "M", "M", "F", "F", "M", "M", …
$ Team      <chr> "China", "China", "Denmark", "Denmark/Sweden", "Netherlands"…
$ NOC       <chr> "CHN", "CHN", "DEN", "DEN", "NED", "NED", "FIN", "FIN", "FIN…
$ Year      <dbl> 1992, 2012, 1920, 1900, 1932, 1932, 1952, 2000, 1996, 2000, …
$ Season    <chr> "Summer", "Summer", "Summer", "Summer", "Summer", "Summer", …
$ City      <chr> "Barcelona", "London", "Antwerpen", "Paris", "Los Angeles", …
$ Sport     <chr> "Basketball", "Judo", "Football", "Tug-Of-War", "Athletics",…
$ Event     <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightweight…
$ Medal     <chr> "No medal", "No medal", "No medal", "Gold", "No medal", "No …
colnames(olympics_dataset)    # List all column names
 [1] "player_id" "Name"      "Sex"       "Team"      "NOC"       "Year"     
 [7] "Season"    "City"      "Sport"     "Event"     "Medal"    

Data Cleaning

# Rename columns to be more standardize & consistent (lower case)
# Select only the relevant columns for analysis
# Convert data types (e.g., year to integer, categories to factors)


olympics_dfclean <- olympics_dataset |>
  rename(
    athlete_name = Name,
    sex = Sex,
    country = Team,
    year = Year,
    sport = Sport,
    medal = Medal,
    event = Event) |>
  mutate(
    year = as.integer(year),
    sex = as.factor(sex),
    country = as.factor(country),
    sport = as.factor(sport),
    medal = as.factor(medal)) |>
  
# Keep only the needed columns, including 'event'
  select(athlete_name, year, sex, country, sport, medal, event)

# Check for missing values
 olympics_dfclean |> 
  is.na() |> 
  sum()
[1] 0
# Preview the cleaned dataset
glimpse(olympics_dfclean)
Rows: 252,565
Columns: 7
$ athlete_name <chr> "A Dijiang", "A Lamusi", "Gunnar Aaby", "Edgar Aabye", "C…
$ year         <int> 1992, 2012, 1920, 1900, 1932, 1932, 1952, 2000, 1996, 200…
$ sex          <fct> M, M, M, M, F, F, M, M, F, F, M, M, M, M, M, M, M, M, M, …
$ country      <fct> "China", "China", "Denmark", "Denmark/Sweden", "Netherlan…
$ sport        <fct> "Basketball", "Judo", "Football", "Tug-Of-War", "Athletic…
$ medal        <fct> No medal, No medal, No medal, Gold, No medal, No medal, N…
$ event        <chr> "Basketball Men's Basketball", "Judo Men's Extra-Lightwei…
head(olympics_dfclean)
# A tibble: 6 × 7
  athlete_name           year sex   country        sport      medal    event    
  <chr>                 <int> <fct> <fct>          <fct>      <fct>    <chr>    
1 A Dijiang              1992 M     China          Basketball No medal Basketba…
2 A Lamusi               2012 M     China          Judo       No medal Judo Men…
3 Gunnar Aaby            1920 M     Denmark        Football   No medal Football…
4 Edgar Aabye            1900 M     Denmark/Sweden Tug-Of-War Gold     Tug-Of-W…
5 Cornelia (-strannood)  1932 F     Netherlands    Athletics  No medal Athletic…
6 Cornelia (-strannood)  1932 F     Netherlands    Athletics  No medal Athletic…

Save Cleaned Data

# Save as CSV
write_csv(olympics_dfclean, "data/olympics_dfclean.csv") 

# Save as .RData
save(olympics_dfclean, file = "data/olympics_dfclean.RData")