-
Notifications
You must be signed in to change notification settings - Fork 1
/
citation_matrix.R
233 lines (197 loc) · 9.02 KB
/
citation_matrix.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
library('bibliometrix')
# On R 4.01 on Mac, an earlier version of dplyr appears to be needed.
# The problem appears to be related to this: https://github.com/tidyverse/dplyr/issues/5017
# For now it can be worked around using this:
# devtools::install_version("dplyr", version = "0.8.5", repos = "http://cran.us.r-project.org")
library('tidyverse')
# Functions ----
# Manual name change table consulted by the clean.name function.
# The CSV is read with the column names old.name, new.name and comment;
# only the two name columns are kept.
modified.authors <- select(
  read_csv(
    file = "data/processed/name_changes.csv",
    col_names = c("old.name", "new.name", "comment")
  ),
  old.name,
  new.name
)
# Function to clean up names, by shortening them and handling special cases listed in a
# manual name change document (name_changes.csv). This ensures multiple instances of a
# name are all coded in a common format. The symbol "NA" is used for names to exclude.
#
# Input:
# `x` is a single string with a full name of an author, ideally in the form
# LASTNAME FIRSTNAME. The function also reads the global table `modified.authors`
# (columns old.name and new.name).
# Returns:
# A single string with a shortened version of that name, for most cases as
# LASTNAME INITIAL. NA_character_ is returned when `x` is missing, when `x` is the
# string "NA", when the manual table maps the name to NA / "NA", or when the name
# does not match the pattern used by the main algorithm.
#
# Examples:
# HEIDEGGER MARTIN -> HEIDEGGER M
# GUSSERL -> HUSSERL E
# ALCOHOLICS ANONYMOUS -> NA
#
# Feel free to add comments to the manual name change document
# https://docs.google.com/spreadsheets/d/14aVjy5vOlNBvnlhcSt5rDbX4conwx4bP6EL1CQkyL2c/edit?usp=sharing
clean.name <- function(x) {
  # Ignore NAs and the "NA" exclusion marker.
  # A character NA is returned on every exclusion path so callers such as
  # map_chr() always receive the same type.
  if (is.na(x) || x == "NA") {
    return(NA_character_)
  }
  # Manual name changes.
  # match() gives the position of the first matching row only, so a name that was
  # entered twice in the table still yields exactly one replacement string.
  name.ind <- match(x, modified.authors$old.name)
  if (!is.na(name.ind)) {
    new.name <- as.character(modified.authors$new.name[name.ind])
    if (is.na(new.name) || new.name == "NA") {
      return(NA_character_)
    }
    return(new.name)
  }
  # The main algorithm applied to all authors. Special cases are above.
  # Extracts the leading run of capital letters, any whitespace after it,
  # and then the first capital letter that follows.
  str_extract(x, "^[A-Z]+[[:space:]]*[[A-Z]]{1}")
}
# Load and parse articles ------
# raw.articles holds the main data; View it to inspect.
# AU is citing author
# CR is citations in the article
# CR_AU is first author listed in each citation, i.e. the cited author
citations_file <- "data/processed/raw_articles.rds"
if (!file.exists(citations_file)) {
  # No cached copy yet: convert the Web of Science exports and cache the result
  files <- c(
    'data/WoS/citations1.txt',
    'data/WoS/citations2.txt',
    'data/WoS/citations3.txt',
    'data/WoS/citations4.txt',
    'data/WoS/citations5.txt',
    'data/WoS/citations6.txt',
    'data/WoS/citations7.txt',
    'data/WoS/citations8.txt',
    'data/WoS/citations9.txt',
    'data/WoS/citations10.txt',
    'data/WoS/citations11.txt',
    'data/WoS/citations12.txt'
  )
  raw.articles <- convert2df(files, dbsource = "isi", format = "plaintext")
  write_rds(raw.articles, citations_file)
} else {
  # Reuse the cached conversion
  raw.articles <- read_rds(citations_file)
}
# Pulls the cited authors from CR and adds them to the table as CR_AU
parsed.articles <- metaTagExtraction(M = raw.articles, Field = "CR_AU")
# Hegel Check
# Collect the articles whose AB, TI or DE field matches one of the titles below
# (presumably abstract, title and keywords; confirm against the bibliometrix
# field tags) and remove every article with one of those titles.
PS <- "PHENOMENOLOGY OF SPIRIT|PHENOMENOLOGY OF MIND|DES GEISTES"
# The search runs on parsed.articles; no object called `articles` exists in this script.
hegel <- parsed.articles %>%
  filter(str_detect(AB, PS) | str_detect(TI, PS) | str_detect(DE, PS)) %>%
  select(AU, TI)
parsed.articles <- filter(parsed.articles, !(TI %in% hegel$TI))
# Create and consolidate citation matrix ------
# Article-by-cited-author matrix, stored as a tibble.
# Each cell counts how many times an article (row) cited an author (column).
citing.matrix <- as_tibble(
  cocMatrix(parsed.articles, Field = "CR_AU", type = "matrix", sep = ";")
)
# Wrangle the data -----
# AU holds the authors of each article separated by ";".
# Keep only the first author of multi-authored pieces.
first.author <- parsed.articles$AU %>%
  strsplit(';') %>%
  map_chr(function(authors) {
    authors[1]
  })
# Attach the first author and move it in front of the cited-author columns
citing.matrix <- select(
  mutate(citing.matrix, first.author = first.author),
  first.author,
  everything()
)
# Initial data cleanup
# Alphabetize by first.author
citing.matrix <- arrange(citing.matrix, first.author)
# Drop rows whose first author is missing, is the string "NA",
# or contains "ANONYMOUS"
citing.matrix <- filter(
  citing.matrix,
  !(is.na(first.author)),
  first.author != "NA",
  !str_detect(first.author, "ANONYMOUS")
)
# Drop columns whose name contains a digit
citing.matrix <- select(citing.matrix, -matches("[[:digit:]]"))
# Shorten cited author names
# The first column name is "first.author", so it is skipped.
# map_chr returns a vector of strings with the shortened names of the columns;
# names that clean.name excludes (NA) are dropped from the vector.
short.cited <- colnames(citing.matrix)[-1] %>%
  map_chr(clean.name)
short.cited <- short.cited[!(is.na(short.cited))]
# Shorten citing authors too so we can align cited authors and citing authors
old.citing <- citing.matrix$first.author
short.citing <- map_chr(old.citing, clean.name)
# clean.name returns NA for citing authors to exclude. Those rows are removed
# from the table together with their names, so the replacement column always
# has exactly one entry per remaining row (assigning an NA-stripped vector to
# the full table fails as soon as a single name is excluded).
keep.citing <- !is.na(short.citing)
short.citing <- short.citing[keep.citing]
citing.matrix <- citing.matrix[keep.citing, ] %>%
  mutate(first.author = short.citing)
# Consolidate the columns
# Temporary numeric matrix of the raw counts (everything but the first.author
# column), with the citing authors as row names.
temp.citing.matrix <- as.matrix(citing.matrix[, -1])
rownames(temp.citing.matrix) <- citing.matrix$first.author
# Target matrix, all zeros to start with.
# It has as many rows as the citing matrix and one column per unique
# shortened cited author.
citing.matrix.clean <- matrix(
  data = 0,
  nrow = nrow(citing.matrix),
  ncol = length(unique(short.cited)),
  dimnames = list(citing.matrix$first.author, unique(short.cited))
)
# Fold every raw column into the column of its shortened name.
# This does by hand for columns what the later row consolidation does for rows:
# multiple spellings of the same author are summed into a single column
# (e.g. "ADORNO", "ADORNO T", "ADORNO TW", "ADORNO T W", etc.)
for (name in sort(colnames(temp.citing.matrix))) {
  # Raw cited author names are visited in alphabetical order
  short.name <- clean.name(name)
  # Excluded names contribute nothing
  if (is.na(short.name)) {
    next
  }
  citing.matrix.clean[, short.name] <-
    citing.matrix.clean[, short.name] + temp.citing.matrix[, name]
}
# Cleanup: keep only cited authors (columns) whose total count is greater than 1,
# which removes authors cited a single time by a single row, and then keep
# only rows with more than 1 remaining citation (a single one is probably noise).
# drop = FALSE keeps the result a matrix even when a single row or column
# survives; without it the subset collapses to a plain vector and the
# following steps fail.
citing.matrix.clean <- citing.matrix.clean[, colSums(citing.matrix.clean) > 1, drop = FALSE]
citing.matrix.clean <- citing.matrix.clean[rowSums(citing.matrix.clean) > 1, , drop = FALSE]
# Ignore rows whose citing author is missing or is the string "NA"
citing.names <- row.names(citing.matrix.clean)
citing.matrix.clean <- citing.matrix.clean[
  !is.na(citing.names) & citing.names != "NA", ,
  drop = FALSE
]
# Sort cited authors (columns) alphabetically
citing.matrix.clean <- citing.matrix.clean[, sort(colnames(citing.matrix.clean)), drop = FALSE]
# For use in snippets.R
# write_rds(citing.matrix.clean, "citing_matrix.rds")
# Keep a copy of the matrix for the co-citation step, taken before the
# citing authors are consolidated
co.citation.matrix <- citing.matrix.clean
# The raw citing table is no longer needed; remove it to free memory
rm(citing.matrix)
# Turn the matrix into a tibble with the row names in a first.author column,
# then consolidate the redundant rows introduced by shortening names.
# E.g. sum all "ALEXANDER R" rows into one row
citing.matrix.clean <- citing.matrix.clean %>%
  as_tibble(rownames = "first.author") %>%
  group_by(first.author) %>%
  summarise_all(sum)
# Make edge lists ------
# Build the edge list for the network.
# It is a directed edge list, i.e. a "long" (vs "wide") version of the matrix.
# See https://data.ca.gov/uploads/page_images/2019-08-28-194741.855848DataPreplong-wide.jpg
# Column 1 (Source) is the CITING author, column 2 (Target) is the CITED author,
# column 3 (Weight) is the number of times the CITING author cited the CITED author.
# complete.citing.authors <- read.delim("complete_matrix_rownames.txt", header = FALSE)
# gather turns the wide table into a long one, one row per author pair
complete.edge.list <- gather(
  citing.matrix.clean,
  key = "Target",
  value = "Weight",
  -first.author
)
complete.edge.list <- rename(complete.edge.list, Source = first.author)
# Pairs without any citation are not edges
complete.edge.list <- filter(complete.edge.list, Weight > 0)
# Write the data ----
write_csv(complete.edge.list, "data/processed/complete_edge_list.csv")
# Make co-citation matrix -----
# Change to binary so that each row counts a cited author at most once
co.citation.matrix[co.citation.matrix > 0] <- 1
# Transpose, so cited authors become the rows
co.citation.matrix <- t(co.citation.matrix)
# Next, multiply by its transposed.
# tcrossprod(x) = x %*% t(x)
co.citation.matrix <- tcrossprod(co.citation.matrix)
# This is a cited author x cited author matrix: each cell is the number of
# rows of the original matrix in which both authors were cited.
# Single co-citations (cell = 1) are probably noise, so get rid of those (replace with 0).
co.citation.matrix[co.citation.matrix == 1] <- 0
# Set the diagonal to 0, it's meaningless.
diag(co.citation.matrix) <- 0
# Clean rows and columns with only 0s.
# drop = FALSE keeps a matrix even if a single row or column is left.
co.citation.matrix <- co.citation.matrix[rowSums(co.citation.matrix) > 0, , drop = FALSE]
co.citation.matrix <- co.citation.matrix[, colSums(co.citation.matrix) > 0, drop = FALSE]
# Count the number of non-zero elements per row.
# If an author only co-appears with `cutoff` or fewer other authors, probably noise.
cutoff <- 1
# Named logical vector, TRUE for the authors to keep
non.zero <- rowSums(co.citation.matrix > 0) > cutoff
co.citation.matrix <- co.citation.matrix[non.zero, non.zero, drop = FALSE]
# Make and save edge list
# NOTE(review): the matrix looks symmetric, so each pair would be listed in
# both directions; confirm this is what the consumer of the CSV expects.
co.citation.edge.list <- co.citation.matrix %>%
  as_tibble(rownames = "Source") %>%
  gather("Target", "Weight", -Source) %>%
  filter(Weight > 0)
write_csv(co.citation.edge.list, "data/processed/cocitation_edge_list.csv")