# Creating a data set of UK Supreme Court decisions
# Stein Arne Brekke
# June 2024
# install.packages("rvest") # rvest does web scraping:
# install.packages("stringr") # stringr is useful for editing strings
# install.packages("ggplot2") # ggplot2 is used for creating graphs
# install.packages("tibble") # tibble is a user friendly package to deal with data frames
library(rvest)
library(stringr)
library(ggplot2)
library(tibble)
# SCRAPING THE UK SUPREME COURT ####
# Loop through years
for(y in 2009:as.numeric(format(Sys.Date(), "%Y"))){
## Download a list of cases ####
url <- paste0("https://www.supremecourt.uk/decided-cases/", y, ".html")
# The current year is different
if(y == as.numeric(format(Sys.Date(), "%Y"))){
url <- paste0("https://www.supremecourt.uk/decided-cases/")
}
page <- read_html(url)
# extract part of document that contains table of cases:
table_code <- html_elements(page, xpath="//table")
# If we want to see the table in the console, we can run the
# following:
# message(table_code)
table <- html_table(table_code)[[1]] # Read table from HTML code
# transform case ID variable to be a proper list
table$ID_list <- table$`Case ID` |>
str_replace_all("(.\\S)\\s*(UKSC)", "\\1&\\2") |> # Divide by & when there are multiple codes
str_split("&") # Split at &
# Remove all duplicated rows and create "judgments" data frame
judgments <- table[!duplicated(table$`Neutral citation`),]
# Insert case numbers from table data frame to prevent loss of data
# in duplicated cases
judgments$ID_list <-
lapply(judgments$`Neutral citation`, function(y)
unique(unlist(table$ID_list[which(table$`Neutral citation` == y)]))
)
# Now that we've streamlined the data, the Case ID column is no
# longer accurate. We can update it:
judgments$`Case ID` <- sapply(judgments$ID_list, function(y) paste(y, collapse=", "))
# Create a "flat" list of all case numbers:
list_of_cases <- unlist(judgments$ID_list)
# Create a data frame on the case level
# I include a few variables I know I want to fill in later on.
cases <- tibble(id = list_of_cases,
citation = NA,
summary_issue = NA,
summary_facts = NA)
# Loop through all cases and add their "neutral citation" from the
# original table
for(i in cases$id){
cases$citation[which(cases$id == i)] <-
judgments$`Neutral citation`[grep(i, judgments$`Case ID`)]
}
# Fix a bug where the wrong case ID is listed with [2015] UKSC 41
if(TRUE %in% duplicated(cases$id)){
cases$id[which(cases$citation == "[2015] UKSC 41")] <- "UKSC 2014/0272"
if(y == 2017){
cases <- cases[which(!duplicated(cases$id)),]
}
}
## Scrape case-level data ####
url_base <- "https://www.supremecourt.uk/cases/"
cases$web_id <- cases$id |>
str_replace_all("\\s|/", "-") |> # Replace spaces (\\s) or (|) slash (/) with dashes
tolower() # make lowercase
# # I define i as the web ID of the first row in the cases data set
# i <- cases$web_id[1]
# You would normally begin the loop here instead:
for(i in cases$web_id){
# We create a full URL by pasting together the base, ID number, and "html"
url <- paste0(url_base, i, ".html")
# and then we read it into object "page":
page <- NA
try(page <- read_html(url), silent = TRUE)
if(is.na(page)){
# Take a longer break before trying again
Sys.sleep(10)
try(page <- read_html(url), silent = TRUE)
}
if(!is.na(page)){
# We also define an object x, which refers to the row in the cases
# data set corresponding to the current observation. As long as i
# is defined as the first value in cases$web_id, x will always be 1.
x <- which(cases$web_id == i)
## Extracting case-level data ####
# case title
cases[x, "title"] <- page |>
html_elements(xpath="//title") |>
html_text() |>
str_remove("\\s?- The Supreme Court")
issue <- page |>
html_elements(xpath='//*[preceding-sibling::h4[. = \'Issue\']
and following-sibling::h4[. = \'Facts\']]')
# Sometimes the headline is "Issues"
if(length(issue) == 0){
issue <- page |>
html_elements(xpath='//*[preceding-sibling::h4[. = \'Issues\']
and following-sibling::h4[. = \'Facts\']]')
}
facts <- page |>
html_elements(xpath='//*[preceding-sibling::h4[. = \'Facts\']
and following-sibling::h4[. = \'Judgment appealed\']]')
if(length(facts) == 0){
facts <- page |>
html_elements(xpath='//*[preceding-sibling::h4[. = \'Facts\']
and following-sibling::h3[. = \'Parties\']]')
}
# Convert text variables into list objects:
cases[[x, "summary_issue"]] <- list(html_text(issue))
cases[[x, "summary_facts"]] <- list(html_text(facts))
# In the following, h4 is replaced by * to work with older entries.
# Justices
cases[x, "justices"] <-
html_text(html_elements(page,
xpath='//p[preceding-sibling::*[. = "Justices"]]')[1])[1]
# Hearing start date
cases[x, "date_hearing_start"] <-
html_text(html_elements(page,
xpath='//p[preceding-sibling::*[. = "Hearing start date"]]')[1])[1]
# Hearing finish date
cases[x, "date_hearing_finish"] <-
html_text(html_elements(page,
xpath='//p[preceding-sibling::*[. = "Hearing finish date"]]')[1])[1]
# Judgment date
cases[x, "date_judgment"] <-
html_text(html_elements(page,
xpath='//p[preceding-sibling::*[. = "Judgment date"]]')[1])[1]
text_link <- page |>
html_elements(xpath='//a[@title = "Find out more on the BAILII website"]')
# in older cases the BAILII link is different
if(length(text_link) == 0){
text_link <- page |>
html_elements(xpath='//a[@title = "Opens new window"]')
text_link <- text_link[grep("Judgment on BAILII", text_link)]
if(TRUE %in% grepl("HTML version\\)", text_link)){
text_link <- text_link[grep("HTML version\\)", text_link)]
}
}
# Keep only the first observed BAILII link
text_link <- text_link[grep("Judgment on BAILII", html_text(text_link))][1]
# Insert URL (href) into table:
cases[x, "text_url"] <- html_attr(text_link, "href")[1]
message(y, " - ", x, "/", nrow(cases), " - ", cases$title[x])
# Rest for between 0.5 and 2 seconds, to give the server a little break.
Sys.sleep(runif(1, .5, 2))
}
}
# Create data tables containing all information
if(exists("uksc_cases")){
uksc_judgments <- rbind(uksc_judgments, judgments)
uksc_cases <- rbind(uksc_cases, cases)
setwd("~/Dokumenter/UKSC data")
save(uksc_judgments, file="uksc_judgments-partial.rda")
save(uksc_cases, file="uksc_cases-partial.rda")
} else {
uksc_cases <- cases
uksc_judgments <- judgments
}
}
cases <- uksc_cases
judgments <- uksc_judgments
# DATA MANAGEMENT ####
## Converting dates ####
# We need to set the language of R to English for the following line to work
Sys.setlocale("LC_TIME", "C")
# if you're having problems here, try the following:
# Sys.setlocale("LC_TIME", "English")
# Sys.setlocale("LC_TIME", "en_UK")
date_judgment <- cases$date_judgment
cases$date_hearing_finish <- cases$date_hearing_finish |>
str_replace("Apri ", "April ")
cases$date_hearing_finish <- as.Date(cases$date_hearing_finish, format = "%d %B %Y")
cases$date_hearing_start <- as.Date(cases$date_hearing_start, format = "%d %B %Y")
cases$date_judgment <- as.Date(cases$date_judgment, format = "%d %B %Y")
cases$length_of_hearing <- (cases$date_hearing_finish - cases$date_hearing_start)+1
cases$length_of_procedure <- cases$date_judgment - cases$date_hearing_start
for(y in unique(gsub("\\[(\\d{4})\\].*$", "\\1", cases$citation))){
x <- grep(y, cases$citation)
bad_dates <- x[which(format(cases$date_judgment[x], "%Y") != y)]
if(length(bad_dates) > 0){
cases$date_judgment[bad_dates] <- cases$date_judgment[bad_dates] |>
str_replace("\\d{4}", as.character(y)) |>
as.Date()
}
}
cases$date_hearing_start[which(cases$date_hearing_start > cases$date_judgment)] <- NA
cases$length_of_procedure <- cases$date_judgment - cases$date_hearing_start
## Managing text data ####
# Additional cleaning of names of justices
cases$justices_list <- cases$justices |>
str_remove("\\s*$") |> # remove trailing spaces
str_replace_all(",? and ", ", ") |>
str_replace_all("\\. ([[:upper:]])", ", \\1") |>
str_replace_all("\\(.*\\)", "") |>
str_replace_all("–", "-") |>
str_replace("Lloyd.Jones", "Lloyd-Jones") |>
str_replace("rrr", "rr") |>
str_replace("Wlson", "Wilson") |>
str_replace_all(",\\W*|&", ", ") |>
str_split(", ")
cases$justices_number <- sapply(cases$justices_list, length)
# Number of male and female justices
cases$justices_male <-
sapply(cases$justices_list, function(y)
length(grep("Lord|Sir", y)))
cases$justices_female <-
sapply(cases$justices_list, function(y)
length(grep("Lady|Dame", y)))
cases$justices_list <-
lapply(cases$justices_list, function(y) y |>
str_replace_all("^\\W*|\\W*$", "") |>
str_replace_all("\\W*\\s", " ") |>
str_replace_all("\\W*[[:upper:]]+$", "")
)
# Who are the justices of the Court, and how often do they appear?
table(unlist(cases$justices_list))
# Migration cases
cases$migration <-
grepl("migration|migrant|asyl", cases$summary_issue, ignore.case = TRUE) |
grepl("migration|migrant|asyl", cases$summary_facts, ignore.case = TRUE)
# Human rights cases
cases$human_rights <-
grepl("human rights|ECHR|ECtHR", cases$summary_issue, ignore.case = TRUE) |
grepl("human rights|ECHR|ECtHR", cases$summary_facts, ignore.case = TRUE)
# Cases mentioning the EU
cases$european_union <-
grepl("EU|European Union", cases$summary_issue, ignore.case = FALSE) |
grepl("EU|European Union", cases$summary_facts, ignore.case = FALSE)
missing_texts <- which(sapply(cases$summary_issue, length) == 0 & sapply(cases$summary_facts, length) == 0)
#
cases$european_union[missing_texts] <- NA
cases$human_rights[missing_texts] <- NA
cases$migration[missing_texts] <- NA
# Length in paragraphs
cases$facts_n_paragraphs <-
sapply(cases$summary_facts, length)
cases$issue_n_paragraphs <-
sapply(cases$summary_issue, length)
# Length in number of characters
cases$facts_n_characters <-
sapply(cases$summary_facts, function(y) sum(nchar(y)))
cases$issue_n_characters <-
sapply(cases$summary_issue, function(y) sum(nchar(y)))
## Aggregating data back to judgment level ####
judgments$length_of_procedure <-
cases$length_of_procedure[match(judgments$`Neutral citation`,
cases$citation)]
judgments$length_of_hearing <-
cases$length_of_hearing[match(judgments$`Neutral citation`,
cases$citation)]
judgments$date_hearing_start <-
cases$date_hearing_start[match(judgments$`Neutral citation`,
cases$citation)]
judgments$date_judgment <-
cases$date_judgment[match(judgments$`Neutral citation`,
cases$citation)]
# number of joined cases
judgments$n_joined_cases <- sapply(judgments$ID_list, length)
# Related to the EU?
## 1. set all observations as NA
judgments$european_union <- NA
## 2. set non-NA cases as FALSE
judgments$european_union[which(sapply(judgments$ID_list, function(y)
TRUE %in% (y %in% cases$id[which(!is.na(cases$european_union))])
))] <- FALSE
## 3. set judgments where any of the related cases are related to the EU as TRUE
judgments$european_union[which(sapply(judgments$ID_list, function(y)
TRUE %in% (y %in% cases$id[which(cases$european_union)])
))] <- TRUE
# Related to human rights?
judgments$human_rights <- NA
judgments$human_rights[which(sapply(judgments$ID_list, function(y)
TRUE %in% (y %in% cases$id[which(!is.na(cases$human_rights))])
))] <- FALSE
judgments$human_rights[which(sapply(judgments$ID_list, function(y)
TRUE %in% (y %in% cases$id[which(cases$human_rights)])
))] <- TRUE
# Length of issues
judgments$length_issue <-
sapply(judgments$`Neutral citation`, function(y)
mean(cases$issue_n_characters[which(cases$citation == y)])
)
# Length of facts
judgments$length_facts <-
sapply(judgments$`Neutral citation`, function(y)
mean(cases$facts_n_characters[which(cases$citation == y)])
)
# Number of justices
judgments$justices_number <-
cases$justices_number[match(judgments$`Neutral citation`,
cases$citation)]
# Binary variable for chamber larger than five justices
judgments$big_chamber <- judgments$justices_number > 5
# Binary variable observing whether a female justice was present
judgments$female_justice <-
judgments$`Neutral citation` %in%
cases$citation[which(cases$justices_female != 0)]
# Alternative formulation. Note that this code would need to be
# expanded if there was ever an all-female chamber.
judgments$gender_balance <-
ifelse(judgments$female_justice, "Mixed gender", "Male only")
# Categorical variable observing if the hearing is short (one day),
# medium (two days), or long (three or more days)
judgments$hearing_length <-
ifelse(judgments$length_of_hearing == 1, "short",
ifelse(judgments$length_of_hearing == 2, "medium",
"long"
))
# Binary variables for short and long hearings:
judgments$short_hearing <- judgments$hearing_length == "short"
judgments$long_hearing <- judgments$hearing_length == "long"
# Shortened version of case name
judgments$case_name_short <- judgments$`Case name` |>
str_remove_all("\\([^\\)]*\\)") |>
str_remove_all("\\)|\\(") |>
str_replace_all("\\s+", " ") |>
str_replace_all("\\s+$|^\\s*", "")
cases$case_name_short <- cases$title |>
str_remove_all("\\([^\\)]*\\)") |>
str_remove_all("\\)|\\(") |>
str_replace_all("\\s+", " ") |>
str_replace_all("\\s+$|^\\s*", "")
uksc_judgments <- judgments
uksc_cases <- cases
# Fix multiple judgment citations in single row
library(splitstackshape)
uksc_judgments$`Neutral citation` <- gsub(",\\W*|\\W*&\\W*\\[", "&", uksc_judgments$`Neutral citation`)
uksc_judgments <- cSplit(uksc_judgments, "Neutral citation", "&", "long")
uksc_cases$citation <- gsub(",\\W*|\\W*&\\W*\\[", "&", uksc_cases$citation)
uksc_cases <- cSplit(uksc_cases, "citation", "&", "long")
# Lowercase variable names without spaces
colnames(uksc_judgments) <- colnames(uksc_judgments) |>
tolower() |>
str_replace_all("\\s+", "_")
# Remove useless columns
uksc_judgments$judgment_date <- NULL
uksc_judgments$case_id <- NULL
uksc_cases$web_id <- NULL
uksc_cases$justices <- NULL
colnames(uksc_judgments)[1] <- "citation"
colnames(uksc_judgments)[3] <- "case_list"
# Remove duplicated observation
uksc_judgments <- uksc_judgments[which(!duplicated(uksc_judgments$citation)),]
uksc_cases <- uksc_cases[which(!duplicated(uksc_cases$id)),]
uksc_cases <- as.data.frame(uksc_cases)
uksc_judgments <- as.data.frame(uksc_judgments)
setwd("~/Dokumenter/UKSC data")
save(uksc_judgments, file="uksc_judgments.rda")
save(uksc_cases, file="uksc_cases.rda")
# Save as CSV
uksc_cases_flat <- uksc_cases
uksc_judgments_flat <- uksc_judgments
for(i in colnames(uksc_cases_flat)){
if(class(uksc_cases_flat[,i]) == "list"){
uksc_cases_flat[,i] <-
sapply(uksc_cases_flat[,i], function(y)
paste(gsub(";", ",", y), collapse="; ") |>
str_replace_all("\\s*;\\s*", "; ") |>
str_remove_all("^\\W*;|;\\W*$")
)
}
}
for(i in colnames(uksc_judgments_flat)){
if(class(uksc_judgments_flat[,i]) == "list"){
uksc_judgments_flat[,i] <-
sapply(uksc_judgments_flat[,i], function(y)
paste(gsub(";", ",", y), collapse="; ") |>
str_replace_all("\\s*;\\s*", "; ") |>
str_remove_all("^\\W*;|;\\W*$")
)
}
}
write.csv(uksc_judgments_flat, file="uksc_judgments.csv")
write.csv(uksc_cases_flat, file="uksc_cases.csv")