# Check that each listed package is available and load it into the session.
pacman::p_load(corporaexplorer, stringi, rvest, readtext, tidyverse)
In-Class Exercise 6 -
Getting Started
Installing and loading the required libraries
The following R packages will be used:
corporaexplorer
stringi
rvest
readtext
tidyverse
The pacman::p_load() call checks whether these packages have been installed and loads them into the working R environment.
The Data
Downloading the King James Bible from Project Gutenberg:
Importing data
# Read the Project Gutenberg plain-text file, one vector element per line.
bible <- readr::read_lines("http://www.gutenberg.org/cache/epub/10/pg10.txt")
Collapse into a string
# Join all lines into one string, separating them with newlines.
bible <- paste(bible, collapse = "\n")
Identify the beginning and end of the Bible
Technique borrowed from https://quanteda.io/articles/pkgdown/replication/digital-humanities.html
# Start position of the first occurrence of the Genesis heading.
start_v <- stri_locate_first_fixed(bible, "The First Book of Moses: Called Genesis")[1]
# End position of the last occurrence of "Amen.".
end_v <- stri_locate_last_fixed(bible, "Amen.")[2]
# Keep only the text between those two positions.
bible <- stri_sub(bible, start_v, end_v)
Split string into books
Every book in the bible is preceded by five newlines, which can be used to split the string into a vector where each element is a book.
# Split the text on runs of five newlines, flatten the result to a character
# vector, and drop element 40.
books <- stri_split_regex(bible, "\n{5}") %>%
  unlist() %>%
  .[-40] # Removing the heading "The New Testament of the King James Bible"
Replacing newlines and space
# Protect paragraph breaks (two or more newlines) with a placeholder, turn the
# remaining single newlines into spaces, then restore the paragraph breaks.
books <- str_replace_all(books, "\n{2,}", "NEW_PARAGRAPH") %>%
  str_replace_all("\n", " ") %>%
  str_replace_all("NEW_PARAGRAPH", "\n\n")
books <- books[3:68] # The two first elements are not books
Identifying chapters within books
# Insert a marker in front of every "<number>:1 " verse reference, then split
# each book on that marker. The result is a list with one character vector
# per book.
chapters <- str_replace_all(books, "(\\d+:1 )", "NEW_CHAPTER\\1") %>%
  stri_split_regex("NEW_CHAPTER")
Remove chapter headings
# Drop the first piece of every book, i.e. the text before its first marker.
chapters <- lapply(chapters, function(x) x[-1])
Retrieve shorter book titles
Retrieve shorter book titles from esv.org to save space in the corpus map plot.
# Scrape the text of the first table column from the abbreviations page and
# keep elements 13 to 78.
book_titles <- read_html("https://www.esv.org/resources/esv-global-study-bible/list-of-abbreviations") %>%
  html_nodes("td:nth-child(1)") %>%
  html_text() %>%
  .[13:78] # Removing irrelevant elements after manual inspection.
Identify the testament of each book
Indicate whether a book belongs to the Old or New Testament.
# 39 "Old" labels followed by 27 "New" labels, one label per book.
testament <- rep(c("Old", "New"), times = c(39, 27))
# Data frame with one book as one row.
# Text is a list column that holds the chapters of each book.
bible_df <- tibble::tibble(
  Text = chapters,
  Book = book_titles,
  Testament = testament
)
# Each chapter to be one row, but keep the metadata (which book and which testament).
bible_df <- tidyr::unnest(bible_df, Text)
Organise Data
The corpus is not organised by date, so date_based_corpus is set to FALSE.
# Build the corpus object from the chapter-level data frame, grouped by book
# instead of by date.
KJB <- prepare_data(
  dataset = bible_df,
  date_based_corpus = FALSE,
  grouping_variable = "Book",
  columns_doc_info = c("Testament", "Book")
)
Explore Corpus
# Pass the prepared corpus object to explore() for interactive exploration.
explore(KJB)
Getting Started
Installing and loading the required libraries
The following R packages will be used:
ggforce
tidygraph
ggraph
visNetwork
skimr
tidytext
tidyverse
graphlayouts
jsonlite
The code chunk below checks whether these packages have been installed and loads them into the working R environment.
# Check that each listed package is available and load it into the session.
pacman::p_load(
  ggforce, tidygraph, ggraph,
  visNetwork, skimr, tidytext, tidyverse, graphlayouts, jsonlite
)
Importing JSON File
# Parse the JSON file into an R object.
mc3_data <- fromJSON("data/MC3.json")
Verify data type
# Check the class of the imported object.
class(mc3_data)
[1] "list"
Extract edges
# Build the edge table from the links element: remove duplicate rows, coerce
# the key columns to character, count rows per (source, target, type)
# combination, and drop self-loops.
mc3_edges <- as_tibble(mc3_data$links) %>%
  distinct() %>%
  mutate(
    source = as.character(source),
    target = as.character(target),
    type = as.character(type)
  ) %>%
  group_by(source, target, type) %>% # to count number of unique links
  summarise(weights = n()) %>%
  filter(source != target) %>%
  ungroup()
Extract nodes
# Build the node table from the nodes element: coerce the columns to plain
# character or numeric types and keep them in a fixed order.
mc3_nodes <- as_tibble(mc3_data$nodes) %>%
  mutate(
    country = as.character(country),
    id = as.character(id),
    product_services = as.character(product_services),
    # Go through character first so the value is parsed from its text form.
    revenue_omu = as.numeric(as.character(revenue_omu)),
    type = as.character(type)
  ) %>%
  select(id, country, type, revenue_omu, product_services)
Modifying network nodes and edges
# Collect every id that appears as an edge source or an edge target.
id1 <- mc3_edges %>%
  select(source) %>%
  rename(id = source)
id2 <- mc3_edges %>%
  select(target) %>%
  rename(id = target)
# One row per distinct id, with node attributes attached where a match exists.
mc3_nodes1 <- rbind(id1, id2) %>%
  distinct() %>%
  left_join(mc3_nodes,
            unmatched = "drop")
Constructing graph
# Build an undirected graph from the node and edge tables, then add
# betweenness and closeness centrality as node columns.
mc3_graph <- tbl_graph(
  nodes = mc3_nodes1,
  edges = mc3_edges,
  directed = FALSE
) %>%
  mutate(
    betweeness_centrality = centrality_betweenness(),
    closeness_centrality = centrality_closeness()
  )
# Print the graph summary.
mc3_graph
# A tbl_graph: 37324 nodes and 24036 edges
#
# A bipartite simple graph with 13330 components
#
# Node Data: 37,324 × 7 (active)
id country type revenue_omu product_services betweeness_centrality
<chr> <chr> <chr> <dbl> <chr> <dbl>
1 1 AS Marine… Islian… Comp… NA Scrapbook embel… 6626
2 1 Ltd. Liab… Mawand… Comp… NA Unknown 0
3 1 S.A. de C… Oceanus Comp… NA Unknown 0
4 1 and Sagl … Kondan… Comp… 18529. Total logistics… 1
5 2 Limited L… Marebak Comp… NA Canning, proces… 6
6 2 Limited L… Marebak Comp… NA Unknown 0
7 2 S.A. de C… Oceanus Comp… 12567. Unknown 0
8 3 Coast Sp … Puerto… Comp… NA Unknown 0
9 3 Limited L… Oceanus Comp… 26867. Fibres, yarns, … 0
10 3 Ltd. Liab… Oceanus Comp… 112667. European specia… 0
# ℹ 37,314 more rows
# ℹ 1 more variable: closeness_centrality <dbl>
#
# Edge Data: 24,036 × 4
from to type weights
<int> <int> <chr> <int>
1 1 16060 Company Contacts 1
2 1 16061 Beneficial Owner 1
3 2 16062 Beneficial Owner 1
# ℹ 24,033 more rows
Graph Visualisation
# Plot only the nodes whose betweenness centrality is at least 300000, using
# the "fr" layout, with node size scaled by betweenness centrality.
# NOTE(review): alpha and color are given as constants inside aes(), so
# ggplot2 treats them as mapped values (with legends) rather than as the
# literal transparency/colour; move them outside aes() if the literal values
# are intended.
mc3_graph %>%
  filter(betweeness_centrality >= 300000) %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(alpha = 0.5)) +
  geom_node_point(aes(
    size = betweeness_centrality,
    color = "lightblue",
    alpha = 0.5
  )) +
  scale_size_continuous(range = c(1, 10)) +
  theme_graph()