Cleaning

Set Up

library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load the Data

# Read the raw CSV into a tibble; readr guesses the column types from the file.
df <- read_csv(file = "data/child_protection.csv")
Rows: 3275 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): System, Subject, Category, Item
dbl (2): Year, No. of Cases

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Explore the Data

First Few Rows

# Peek at the first six rows to confirm the file was parsed as expected.
df |> head()
# A tibble: 6 × 6
  System Subject                             Category Item   Year `No. of Cases`
  <chr>  <chr>                               <chr>    <chr> <dbl>          <dbl>
1 CPR    Newly Registered Child Protection … Types o… Phys…  2005            413
2 CPR    Newly Registered Child Protection … Types o… Negl…  2005             41
3 CPR    Newly Registered Child Protection … Types o… Sexu…  2005            234
4 CPR    Newly Registered Child Protection … Types o… Psyc…  2005             23
5 CPR    Newly Registered Child Protection … Types o… Mult…  2005             52
6 CPR    Newly Registered Child Protection … Sex of … Fema…  2005            462

Column Details

# Transposed overview: one line per column with its type and leading values.
df |> glimpse()
Rows: 3,275
Columns: 6
$ System         <chr> "CPR", "CPR", "CPR", "CPR", "CPR", "CPR", "CPR", "CPR",…
$ Subject        <chr> "Newly Registered Child Protection Cases", "Newly Regis…
$ Category       <chr> "Types of Harm/Maltreatment", "Types of Harm/Maltreatme…
$ Item           <chr> "Physical abuse", "Neglect", "Sexual abuse", "Psycholog…
$ Year           <dbl> 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2…
$ `No. of Cases` <dbl> 413, 41, 234, 23, 52, 462, 301, 454, 24, 12, 36, 21, 35…

Explore Unique Column Values

# Distinct Item labels, listed in order of first appearance in the data.
df |>
  pull(Item) |>
  unique()
  [1] "Physical abuse"                        
  [2] "Neglect"                               
  [3] "Sexual abuse"                          
  [4] "Psychological abuse"                   
  [5] "Multiple abuse"                        
  [6] "Female"                                
  [7] "Male"                                  
  [8] "Parent"                                
  [9] "Sibling"                               
 [10] "Step-parent"                           
 [11] "Grandparent"                           
 [12] "Relative"                              
 [13] "Family friend / Friend"                
 [14] "Family friend / Parent of peer"        
 [15] "Caregiver"                             
 [16] "Teacher"                               
 [17] "School teacher / personnel"            
 [18] "Staff of boarding section of school"   
 [19] "Tutor / Coach"                         
 [20] "Religious personnel"                   
 [21] "Schoolmate / friend / peer"            
 [22] "Co-tenant / Neighbour"                 
 [23] "Inmate of residential service"         
 [24] "Unrelated person (including strangers)"
 [25] "Unidentified Person / Others"          
 [26] "Unidentified person"                   
 [27] "Other"                                 
 [28] "Chinese"                               
 [29] "Pakistani"                             
 [30] "Filipino"                              
 [31] "Indonesian"                            
 [32] "Indian"                                
 [33] "African"                               
 [34] "New Zealander"                         
 [35] "Australian"                            
 [36] "Thai"                                  
 [37] "Vietnamese"                            
 [38] "Others"                                
 [39] "Unknown"                               
 [40] "Central & Western"                     
 [41] "Southern"                              
 [42] "Islands"                               
 [43] "Eastern"                               
 [44] "Wan Chai"                              
 [45] "Kowloon City"                          
 [46] "Yau Tsim Mong"                         
 [47] "Sham Shui Po"                          
 [48] "Wong Tai Sin"                          
 [49] "Sai Kung"                              
 [50] "Kwun Tong"                             
 [51] "Shatin"                                
 [52] "Tai Po"                                
 [53] "North"                                 
 [54] "Yuen Long"                             
 [55] "Tsuen Wan"                             
 [56] "Kwai Tsing"                            
 [57] "Tuen Mun"                              
 [58] "Outside Hong Kong"                     
 [59] "Physical violence"                     
 [60] "Sexual violence"                       
 [61] "Multiple violence"                     
 [62] "Husband"                               
 [63] "Wife"                                  
 [64] "Male Cohabitant"                       
 [65] "Female Cohabitant"                     
 [66] "Estranged husband / ex-husband"        
 [67] "Estranged wife / ex-wife"              
 [68] "Boyfriend"                             
 [69] "Girlfriend"                            
 [70] "Heterosexual cohabitant"               
 [71] "Same-sex cohabitant"                   
 [72] "Heterosexual ex-cohabitant"            
 [73] "Same-sex ex-cohabitant"                
 [74] "White"                                 
 [75] "Nepalese"                              
 [76] "Japanese"                              
 [77] "Other Asian"                           
 [78] "Island"                                
 [79] "Outside Hong Kong/Unknown"             
 [80] "Rape/Unlawful Sexual Intercourse"      
 [81] "Indecent assault"                      
 [82] "Forced masturbation"                   
 [83] "Forced oral sex"                       
 [84] "Unlawful Buggery"                      
 [85] "Multiple Abuse"                        
 [86] "Parent/child/sibling/in-laws"          
 [87] "Child"                                 
 [88] "In-law"                                
 [89] "Other relative"                        
 [90] "Heterosexual lover"                    
 [91] "Same-sex lover"                        
 [92] "Heterosexual ex-lover"                 
 [93] "Same-sex ex-lover"                     
 [94] "Friend"                                
 [95] "Caregiver (Non-relative)"              
 [96] "Employer / employee / colleague"       
 [97] "Teacher / tutor"                       
 [98] "Stranger"                              
 [99] "British"                               
[100] "Canadian"                              
[101] "Russian"                               
[102] "Italian"                               
[103] "Nigerian"                              
[104] "Malaysian"                             
[105] "Mongolian"                             
[106] "Sri Lankan"                            
[107] "Singaporean"                           
[108] "French"                                
[109] "Bengali"                               
[110] "Korean"                                
[111] "American"                              
[112] "German"                                

Comprehensive Summary of the Data

library(skimr)
# Per-column summary grouped by type (missing counts, distinct values,
# quantiles). The native pipe keeps the reported data name as `df`.
df |> skim()
Data summary
Name df
Number of rows 3275
Number of columns 6
_______________________
Column type frequency:
character 4
numeric 2
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
System 0 1 3 8 0 2 0
Subject 0 1 36 50 0 3 0
Category 0 1 13 54 0 12 0
Item 0 1 4 38 0 112 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
Year 0 1.00 2014.16 5.54 2005 2009 2014 2019 2023 ▇▇▆▇▇
No. of Cases 773 0.76 171.87 462.22 0 12 37 128 5575 ▇▁▁▁▁

Data Cleaning

Clean Up Column Names

# Swap the raw headers for short, lower-case names. Only the names change:
# column order, types and values are carried over untouched.
df_clean <- rename(
  df,
  system   = System,
  subject  = Subject,
  category = Category,
  item     = Item,
  year     = Year,
  # Backticks are required because the raw header is not a syntactic R name.
  cases    = `No. of Cases`
)

# Confirm the rename: same six columns, now with the new names.
df_clean |> glimpse()
Rows: 3,275
Columns: 6
$ system   <chr> "CPR", "CPR", "CPR", "CPR", "CPR", "CPR", "CPR", "CPR", "CPR"…
$ subject  <chr> "Newly Registered Child Protection Cases", "Newly Registered …
$ category <chr> "Types of Harm/Maltreatment", "Types of Harm/Maltreatment", "…
$ item     <chr> "Physical abuse", "Neglect", "Sexual abuse", "Psychological a…
$ year     <dbl> 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2005, 2…
$ cases    <dbl> 413, 41, 234, 23, 52, 462, 301, 454, 24, 12, 36, 21, 35, NA, …

Save the Cleaned Data

# Write the renamed tibble to disk as a single-object RDS file.
saveRDS(object = df_clean, file = "data/child_protection_clean.rds")