Download script

uksc.R

# Creating a data set of UK Supreme Court decisions
# Stein Arne Brekke
# June 2024


# install.packages("rvest") # rvest does web scraping:
# install.packages("stringr") # stringr is useful for editing strings
# install.packages("ggplot2") # ggplot2 is used for creating graphs 
# install.packages("tibble") # tibble is a user friendly package to deal with data frames

library(rvest)
library(stringr)
library(ggplot2)
library(tibble)

# SCRAPING THE UK SUPREME COURT ####

# Loop through years
for(y in 2009:as.numeric(format(Sys.Date(), "%Y"))){
  
  ## Download a list of cases ####
  
  url <- paste0("https://www.supremecourt.uk/decided-cases/", y, ".html")
  
  # The current year is different
  if(y == as.numeric(format(Sys.Date(), "%Y"))){
    url <- paste0("https://www.supremecourt.uk/decided-cases/")
  }
  
  page <- read_html(url)
  
  # extract part of document that contains table of cases:
  table_code <- html_elements(page, xpath="//table") 
  
  # If we want to see the table in the console, we can run the 
  # following:
  # message(table_code)
  
  table <- html_table(table_code)[[1]] # Read table from HTML code
  
  # transform case ID variable to be a proper list
  table$ID_list <- table$`Case ID` |>
    str_replace_all("(.\\S)\\s*(UKSC)", "\\1&\\2") |> # Divide by & when there are multiple codes
    str_split("&") # Split at &
  
  # Remove all duplicated rows and create "judgments" data frame
  judgments <- table[!duplicated(table$`Neutral citation`),]
  
  # Insert case numbers from table data frame to prevent loss of data
  # in duplicated cases
  judgments$ID_list <-
    lapply(judgments$`Neutral citation`, function(y)
      unique(unlist(table$ID_list[which(table$`Neutral citation` == y)]))
    )
  
  # Now that we've streamlined the data, the Case ID column is no
  # longer accurate. We can update it:
  judgments$`Case ID` <- sapply(judgments$ID_list, function(y) paste(y, collapse=", "))
  # Create a "flat" list of all case numbers:
  list_of_cases <- unlist(judgments$ID_list)
  
  # Create a data frame on the case level
  # I include a few variables I know I want to fill in later on.
  cases <- tibble(id = list_of_cases,
                  citation = NA,
                  summary_issue = NA,
                  summary_facts = NA)
  
  # Loop through all cases and add their "neutral citation" from the
  # original table
  for(i in cases$id){
    cases$citation[which(cases$id == i)] <-
      judgments$`Neutral citation`[grep(i, judgments$`Case ID`)]
  }
  
  # Fix a bug where the wrong case ID is listed with [2015] UKSC 41
  if(TRUE %in% duplicated(cases$id)){
    cases$id[which(cases$citation == "[2015] UKSC 41")] <- "UKSC 2014/0272"
    if(y == 2017){
      cases <- cases[which(!duplicated(cases$id)),]
    }
    
  }
  
  
  ## Scrape case-level data ####
  
  url_base <- "https://www.supremecourt.uk/cases/"
  
  
  
  cases$web_id <- cases$id |>
    str_replace_all("\\s|/", "-") |> # Replace spaces (\\s) or (|) slash (/) with dashes
    tolower() # make lowercase
  
  
  # # I define i as the web ID of the first row in the cases data set
  # i <- cases$web_id[1] 
  
  # You would normally begin the loop here instead:
  for(i in cases$web_id){
    
    # We create a full URL by pasting together the base, ID number, and "html"
    url <- paste0(url_base, i, ".html")
    
    # and then we read it into object "page":
    page <- NA
    try(page <- read_html(url), silent = TRUE)
    if(is.na(page)){
      # Take a longer break before trying again
      Sys.sleep(10)
      try(page <- read_html(url), silent = TRUE)
    }
    if(!is.na(page)){
      # We also define an object x, which refers to the row in the cases
      # data set corresponding to the current observation. As long as i 
      # is defined as the first value in cases$web_id, x will always be 1.
      x <- which(cases$web_id == i)
      
      ## Extracting case-level data ####
      # case title
      cases[x, "title"] <- page |>
        html_elements(xpath="//title") |>
        html_text() |>
        str_remove("\\s?- The Supreme Court")
      
      issue <- page |>
        html_elements(xpath='//*[preceding-sibling::h4[. = \'Issue\']
                        and following-sibling::h4[. = \'Facts\']]')
      
      # Sometimes the headline is "Issues"
      if(length(issue) == 0){
        issue <- page |>
          html_elements(xpath='//*[preceding-sibling::h4[. = \'Issues\']
                            and following-sibling::h4[. = \'Facts\']]')
      }
      
      facts <- page |>
        html_elements(xpath='//*[preceding-sibling::h4[. = \'Facts\']
                        and following-sibling::h4[. = \'Judgment appealed\']]')
      
      if(length(facts) == 0){
        facts <- page |>
          html_elements(xpath='//*[preceding-sibling::h4[. = \'Facts\']
                        and following-sibling::h3[. = \'Parties\']]')
      }
      
      # Convert text variables into list objects:
      cases[[x, "summary_issue"]] <- list(html_text(issue))
      cases[[x, "summary_facts"]] <- list(html_text(facts))
      
      # In the following, h4 is replaced by * to work with older entries. 
      
      # Justices
      cases[x, "justices"] <-
        html_text(html_elements(page, 
                                xpath='//p[preceding-sibling::*[. = "Justices"]]')[1])[1]
      
      # Hearing start date
      cases[x, "date_hearing_start"] <-
        html_text(html_elements(page, 
                                xpath='//p[preceding-sibling::*[. = "Hearing start date"]]')[1])[1]
      
      # Hearing finish date
      cases[x, "date_hearing_finish"] <-
        html_text(html_elements(page, 
                                xpath='//p[preceding-sibling::*[. = "Hearing finish date"]]')[1])[1]
      
      # Judgment date
      cases[x, "date_judgment"] <-
        html_text(html_elements(page, 
                                xpath='//p[preceding-sibling::*[. = "Judgment date"]]')[1])[1]
      
      text_link <- page |>
        html_elements(xpath='//a[@title = "Find out more on the BAILII website"]')
      
      # in older cases the BAILII link is different
      if(length(text_link) == 0){
        text_link <- page |>
          html_elements(xpath='//a[@title = "Opens new window"]')
        text_link <- text_link[grep("Judgment on BAILII", text_link)]
        if(TRUE %in% grepl("HTML version\\)", text_link)){
          text_link <- text_link[grep("HTML version\\)", text_link)]
        }
      }
      
      # Keep only the first observed BAILII link
      text_link <- text_link[grep("Judgment on BAILII", html_text(text_link))][1]
      
      # Insert URL (href) into table:
      cases[x, "text_url"] <- html_attr(text_link, "href")[1]
      
      message(y, " - ", x, "/", nrow(cases), " - ", cases$title[x])
      # Rest for between 0.5 and 2 seconds, to give the server a little break.
      Sys.sleep(runif(1, .5, 2))
    }
  }
  # Create data tables containing all information
  if(exists("uksc_cases")){
    uksc_judgments <- rbind(uksc_judgments, judgments)
    uksc_cases <- rbind(uksc_cases, cases)
    
    setwd("~/Dokumenter/UKSC data")
    save(uksc_judgments, file="uksc_judgments-partial.rda")
    save(uksc_cases, file="uksc_cases-partial.rda")
  } else {
    uksc_cases <- cases
    uksc_judgments <- judgments
  }
}

cases <- uksc_cases
judgments <- uksc_judgments

# DATA MANAGEMENT ####
## Converting dates ####

# We need to set the language of R to English for the following line to work
Sys.setlocale("LC_TIME", "C") 
# if you're having problems here, try the following:
# Sys.setlocale("LC_TIME", "English")
# Sys.setlocale("LC_TIME", "en_UK")


date_judgment <- cases$date_judgment
cases$date_hearing_finish <- cases$date_hearing_finish |>
  str_replace("Apri ", "April ")
cases$date_hearing_finish <- as.Date(cases$date_hearing_finish, format = "%d %B %Y")
cases$date_hearing_start <- as.Date(cases$date_hearing_start, format = "%d %B %Y")
cases$date_judgment  <- as.Date(cases$date_judgment, format = "%d %B %Y")


cases$length_of_hearing <- (cases$date_hearing_finish - cases$date_hearing_start)+1
cases$length_of_procedure <- cases$date_judgment - cases$date_hearing_start

for(y in unique(gsub("\\[(\\d{4})\\].*$", "\\1", cases$citation))){
  x <- grep(y, cases$citation)
  bad_dates <- x[which(format(cases$date_judgment[x], "%Y") != y)]
  if(length(bad_dates) > 0){
    cases$date_judgment[bad_dates] <- cases$date_judgment[bad_dates] |>
      str_replace("\\d{4}", as.character(y)) |>
      as.Date()
  }
}
cases$date_hearing_start[which(cases$date_hearing_start > cases$date_judgment)] <- NA
cases$length_of_procedure <- cases$date_judgment - cases$date_hearing_start



## Managing text data #### 

# Additional cleaning of names of justices
cases$justices_list <- cases$justices |>
  str_remove("\\s*$") |> # remove trailing spaces
  str_replace_all(",? and ", ", ") |> 
  str_replace_all("\\. ([[:upper:]])", ", \\1") |>  
  str_replace_all("\\(.*\\)", "") |> 
  str_replace_all("–", "-") |>
  str_replace("Lloyd.Jones", "Lloyd-Jones") |>
  str_replace("rrr", "rr") |>
  str_replace("Wlson", "Wilson") |>
  str_replace_all(",\\W*|&", ", ") |>
  str_split(", ")


cases$justices_number <- sapply(cases$justices_list, length)

# Number of male and female justices
cases$justices_male <- 
  sapply(cases$justices_list, function(y) 
    length(grep("Lord|Sir", y)))

cases$justices_female <- 
  sapply(cases$justices_list, function(y) 
    length(grep("Lady|Dame", y)))



cases$justices_list <- 
  lapply(cases$justices_list, function(y) y |>
           str_replace_all("^\\W*|\\W*$", "") |>
           str_replace_all("\\W*\\s", " ") |>
           str_replace_all("\\W*[[:upper:]]+$", "")
  )

# Who are the justices of the Court, and how often do they appear?
table(unlist(cases$justices_list))

# Migration cases
cases$migration <- 
  grepl("migration|migrant|asyl", cases$summary_issue, ignore.case = TRUE) |
  grepl("migration|migrant|asyl", cases$summary_facts, ignore.case = TRUE)

# Human rights cases
cases$human_rights <- 
  grepl("human rights|ECHR|ECtHR", cases$summary_issue, ignore.case = TRUE) |
  grepl("human rights|ECHR|ECtHR", cases$summary_facts, ignore.case = TRUE)


# Cases mentioning the EU
cases$european_union <- 
  grepl("EU|European Union", cases$summary_issue, ignore.case = FALSE) |
  grepl("EU|European Union", cases$summary_facts, ignore.case = FALSE)

missing_texts <- which(sapply(cases$summary_issue, length) == 0 & sapply(cases$summary_facts, length) == 0)

# 
cases$european_union[missing_texts] <- NA
cases$human_rights[missing_texts] <- NA
cases$migration[missing_texts] <- NA

# Length in paragraphs
cases$facts_n_paragraphs <- 
  sapply(cases$summary_facts, length)
cases$issue_n_paragraphs <- 
  sapply(cases$summary_issue, length)

# Length in number of characters
cases$facts_n_characters <- 
  sapply(cases$summary_facts, function(y) sum(nchar(y)))
cases$issue_n_characters <- 
  sapply(cases$summary_issue, function(y) sum(nchar(y)))


## Aggregating data back to judgment level ####

judgments$length_of_procedure <- 
  cases$length_of_procedure[match(judgments$`Neutral citation`,
                                  cases$citation)]
judgments$length_of_hearing <- 
  cases$length_of_hearing[match(judgments$`Neutral citation`, 
                                cases$citation)]

judgments$date_hearing_start <- 
  cases$date_hearing_start[match(judgments$`Neutral citation`,
                                 cases$citation)]
judgments$date_judgment <- 
  cases$date_judgment[match(judgments$`Neutral citation`,
                            cases$citation)]

# number of joined cases
judgments$n_joined_cases <- sapply(judgments$ID_list, length)

# Related to the EU?

## 1. set all observations as NA
judgments$european_union <- NA

## 2. set non-NA cases as FALSE
judgments$european_union[which(sapply(judgments$ID_list, function(y)
  TRUE %in% (y %in% cases$id[which(!is.na(cases$european_union))])
))] <- FALSE

## 3. set judgments where any of the related cases are related to the EU as TRUE
judgments$european_union[which(sapply(judgments$ID_list, function(y)
  TRUE %in% (y %in% cases$id[which(cases$european_union)])
))] <- TRUE


# Related to human rights?
judgments$human_rights <- NA
judgments$human_rights[which(sapply(judgments$ID_list, function(y)
  TRUE %in% (y %in% cases$id[which(!is.na(cases$human_rights))])
))] <- FALSE
judgments$human_rights[which(sapply(judgments$ID_list, function(y)
  TRUE %in% (y %in% cases$id[which(cases$human_rights)])
))] <- TRUE


# Length of issues
judgments$length_issue <- 
  sapply(judgments$`Neutral citation`, function(y)
    mean(cases$issue_n_characters[which(cases$citation == y)])
  )

# Length of facts
judgments$length_facts <- 
  sapply(judgments$`Neutral citation`, function(y)
    mean(cases$facts_n_characters[which(cases$citation == y)])
  )

# Number of justices
judgments$justices_number <- 
  cases$justices_number[match(judgments$`Neutral citation`,
                              cases$citation)]

# Binary variable for chamber larger than five justices
judgments$big_chamber <- judgments$justices_number > 5

# Binary variable observing whether a female justice was present
judgments$female_justice <- 
  judgments$`Neutral citation` %in% 
  cases$citation[which(cases$justices_female != 0)]

# Alternative formulation. Note that this code would need to be 
# expanded if there was ever an all-female chamber. 
judgments$gender_balance <- 
  ifelse(judgments$female_justice, "Mixed gender", "Male only")

# Categorical variable observing if the hearing is short (one day),
# medium (two days), or long (three or more days)
judgments$hearing_length <- 
  ifelse(judgments$length_of_hearing == 1, "short",
         ifelse(judgments$length_of_hearing == 2, "medium", 
                "long"
         ))

# Binary variables for short and long hearings:
judgments$short_hearing <- judgments$hearing_length == "short"
judgments$long_hearing <- judgments$hearing_length == "long"

# Shortened version of case name
judgments$case_name_short <- judgments$`Case name` |>
  str_remove_all("\\([^\\)]*\\)") |>
  str_remove_all("\\)|\\(") |>
  str_replace_all("\\s+", " ") |>
  str_replace_all("\\s+$|^\\s*", "")
cases$case_name_short <- cases$title |>
  str_remove_all("\\([^\\)]*\\)") |>
  str_remove_all("\\)|\\(") |>
  str_replace_all("\\s+", " ") |>
  str_replace_all("\\s+$|^\\s*", "")


uksc_judgments <- judgments
uksc_cases <- cases

# Fix multiple judgment citations in single row 
library(splitstackshape)
uksc_judgments$`Neutral citation` <- gsub(",\\W*|\\W*&\\W*\\[", "&", uksc_judgments$`Neutral citation`)
uksc_judgments <- cSplit(uksc_judgments, "Neutral citation", "&", "long")
uksc_cases$citation <- gsub(",\\W*|\\W*&\\W*\\[", "&", uksc_cases$citation)
uksc_cases <- cSplit(uksc_cases, "citation", "&", "long")

# Lowercase variable names without spaces
colnames(uksc_judgments) <- colnames(uksc_judgments) |> 
  tolower() |>
  str_replace_all("\\s+", "_")


# Remove useless columns
uksc_judgments$judgment_date <- NULL
uksc_judgments$case_id <- NULL

uksc_cases$web_id <- NULL
uksc_cases$justices <- NULL

colnames(uksc_judgments)[1] <- "citation"
colnames(uksc_judgments)[3] <- "case_list"

# Remove duplicated observation
uksc_judgments <- uksc_judgments[which(!duplicated(uksc_judgments$citation)),]
uksc_cases <- uksc_cases[which(!duplicated(uksc_cases$id)),]

uksc_cases <- as.data.frame(uksc_cases)
uksc_judgments <- as.data.frame(uksc_judgments)


setwd("~/Dokumenter/UKSC data")
save(uksc_judgments, file="uksc_judgments.rda")
save(uksc_cases, file="uksc_cases.rda")

# Save as CSV
uksc_cases_flat <- uksc_cases
uksc_judgments_flat <- uksc_judgments

for(i in colnames(uksc_cases_flat)){
  if(class(uksc_cases_flat[,i]) == "list"){
    uksc_cases_flat[,i] <- 
      sapply(uksc_cases_flat[,i], function(y) 
        paste(gsub(";", ",", y), collapse="; ") |>
          str_replace_all("\\s*;\\s*", "; ") |>
          str_remove_all("^\\W*;|;\\W*$")
      )
  }
}

for(i in colnames(uksc_judgments_flat)){
  if(class(uksc_judgments_flat[,i]) == "list"){
    uksc_judgments_flat[,i] <- 
      sapply(uksc_judgments_flat[,i], function(y) 
        paste(gsub(";", ",", y), collapse="; ") |>
          str_replace_all("\\s*;\\s*", "; ") |>
          str_remove_all("^\\W*;|;\\W*$")
      )
  }
}

write.csv(uksc_judgments_flat, file="uksc_judgments.csv")
write.csv(uksc_cases_flat, file="uksc_cases.csv")

← Go to data page