In-Class Exercise 6 -

Author

Alicia Loh

Published

May 18, 2024

Modified

May 18, 2024

Getting Started

Installing and loading the required libraries

The following R packages will be used:

corporaexplorer
stringi
rvest
readtext
tidyverse

Code chunk below will be used to check if these packages have been installed and also will load them into the working R environment.

pacman::p_load(corporaexplorer, stringi, rvest, readtext, tidyverse)

The Data

Downloading the King James Bible from Project Gutenberg:

Importing data

bible <- readr::read_lines("http://www.gutenberg.org/cache/epub/10/pg10.txt")

Collapse into a string

bible <- paste(bible, collapse = "\n")

Identify the beginning and end of the Bible

Technique borrowed from https://quanteda.io/articles/pkgdown/replication/digital-humanities.html

start_v <- stri_locate_first_fixed(bible, "The First Book of Moses: Called Genesis")[1]
end_v <- stri_locate_last_fixed(bible, "Amen.")[2]
bible <- stri_sub(bible, start_v, end_v)

Split string into books

Every book in the bible is preceded by five newlines, which can be used to split the string into a vector where each element is a book.

books <- stri_split_regex(bible, "\n{5}") %>%
    unlist %>%
    .[-40]  
# Removing the heading "The New Testament of the King James Bible"

Replacing newlines and space

books <- str_replace_all(books, "\n{2,}", "NEW_PARAGRAPH") %>%
    str_replace_all("\n", " ") %>%
    str_replace_all("NEW_PARAGRAPH", "\n\n")
books <- books[3:68]  # The two first elements are not books

Identifying chapters within books

chapters <- str_replace_all(books, "(\\d+:1 )", "NEW_CHAPTER\\1") %>%
    stri_split_regex("NEW_CHAPTER")

Remove chapter headings

chapters <- lapply(chapters, function(x) x[-1])

Retrieve shorter book titles

Retrieve shorter book titles from esv.org to save space in the corpus map plot.

book_titles <- read_html("https://www.esv.org/resources/esv-global-study-bible/list-of-abbreviations") %>%
  html_nodes("td:nth-child(1)") %>%
  html_text() %>%
  .[13:78]  # Removing irrelevant elements after manual inspection.

Identify belonging of book

Indicate whether a book belongs to the Old or New Testament.

testament <- c(rep("Old", 39), rep("New", 27))

# Data frame with one book as one row.
bible_df <- tibble::tibble(Text = chapters,
                           Book = book_titles,
                           Testament = testament)

# Each chapter to be one row, but keep the metadata (which book and which testament).
bible_df <- tidyr::unnest(bible_df, Text)

Organise Data

Corpus not organised by date, so date_based_corpus to FALSE.

KJB <- prepare_data(dataset = bible_df,
                    date_based_corpus = FALSE,
                    grouping_variable = "Book",
                    columns_doc_info = c("Testament", "Book"))

Explore Corpus

explore(KJB)

Shiny applications not supported in static R Markdown documents

Getting Started

Installing and loading the required libraries

The following R packages will be used:

ggforce
tidygraph
ggraph
visNetwork
skimr
tidytext
tidyverse
graphlayouts
jsonlite

Code chunk below will be used to check if these packages have been installed and also will load them into the working R environment.

pacman::p_load(ggforce, tidygraph, ggraph, 
               visNetwork, skimr, tidytext,
               tidyverse, graphlayouts, jsonlite)

Importing JSON File

mc3_data <- fromJSON("data/MC3.json")

Verify data type

class(mc3_data)

[1] "list"

Extract edges

mc3_edges <-
  as_tibble(mc3_data$links) %>%
  distinct() %>%
  mutate(source =
           as.character(source),
         target =
           as.character(target),
         type = as.character(type)) %>%
  group_by(source,target,type) %>% #to count number of unique links
  summarise(weights = n()) %>%
  filter(source!=target) %>%
  ungroup()

Extract nodes

mc3_nodes <-
  as_tibble(mc3_data$nodes) %>%
  mutate(country = as.character(country),
         id = as.character(id),
         product_services = as.character(product_services),
         revenue_omu = as.numeric(as.character(revenue_omu)),
         type = as.character(type)) %>%
  select(id, country, type, revenue_omu, product_services)

Modifying network nodes and edges

id1 <- mc3_edges %>%
  select(source) %>%
  rename(id = source)

id2 <- mc3_edges %>%
  select(target) %>%
  rename (id = target)
mc3_nodes1 <- rbind(id1, id2) %>%
  distinct() %>%
  left_join(mc3_nodes,
            unmatched = "drop")

Constructing graph

mc3_graph <- tbl_graph(nodes = mc3_nodes1,
                       edges = mc3_edges,
                       directed = FALSE) %>%
  mutate(betweeness_centrality = 
           centrality_betweenness(),
         closeness_centrality = 
           centrality_closeness())

mc3_graph

# A tbl_graph: 37324 nodes and 24036 edges
#
# A bipartite simple graph with 13330 components
#
# Node Data: 37,324 × 7 (active)
   id           country type  revenue_omu product_services betweeness_centrality
   <chr>        <chr>   <chr>       <dbl> <chr>                            <dbl>
 1 1 AS Marine… Islian… Comp…         NA  Scrapbook embel…                  6626
 2 1 Ltd. Liab… Mawand… Comp…         NA  Unknown                              0
 3 1 S.A. de C… Oceanus Comp…         NA  Unknown                              0
 4 1 and Sagl … Kondan… Comp…      18529. Total logistics…                     1
 5 2 Limited L… Marebak Comp…         NA  Canning, proces…                     6
 6 2 Limited L… Marebak Comp…         NA  Unknown                              0
 7 2 S.A. de C… Oceanus Comp…      12567. Unknown                              0
 8 3 Coast Sp … Puerto… Comp…         NA  Unknown                              0
 9 3 Limited L… Oceanus Comp…      26867. Fibres, yarns, …                     0
10 3 Ltd. Liab… Oceanus Comp…     112667. European specia…                     0
# ℹ 37,314 more rows
# ℹ 1 more variable: closeness_centrality <dbl>
#
# Edge Data: 24,036 × 4
   from    to type             weights
  <int> <int> <chr>              <int>
1     1 16060 Company Contacts       1
2     1 16061 Beneficial Owner       1
3     2 16062 Beneficial Owner       1
# ℹ 24,033 more rows

Graph Visualisation

mc3_graph %>%
  filter(betweeness_centrality >= 300000) %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = 0.5)) +
  geom_node_point(aes(
    size = betweeness_centrality,
    color = "lightblue",
    alpha = 0.5)) +
  scale_size_continuous(range=c(1,10)) +
  theme_graph()