0718 HW03:文字探勘

Goal:分析紅樓夢文本角色出現次數

備註:因為維基百科的人物列表讀取出為簡體字,故把文本轉換成簡體字再讀入。

Step1:載入package

library(magrittr)
library(tmcn)
## # tmcn Version: 0.2-12
library(NLP)
library(xml2)
library(tm)
library(tmap)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
library(rvest)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)

Step2:設置路徑與讀入文本

# NOTE(review): hard-coded, machine-specific working directory. The relative
# paths used later in this report ("REED.txt", "Names.csv") are resolved
# against it, so the script only runs as-is on a machine with this folder.
setwd("C:/Users/User/Desktop")
library(readr)
## 
## Attaching package: 'readr'
## The following object is masked from 'package:rvest':
## 
##     guess_encoding
# Read the novel text from "REED.txt" as a single string and wrap it in a
# tm corpus so the tm_map() cleaning steps can be applied to it.
REED <- Corpus(VectorSource(read_file("REED.txt")))
# Corpus transformer that deletes every match of `pattern` from a document's
# text. Despite the name, matches are replaced with the empty string, not
# with a space.
#   x       - document text handed over by tm_map()
#   pattern - regular expression to remove
toSpace <- content_transformer(
  function(x, pattern) gsub(pattern, replacement = "", x = x)
)

Step3:文本清理(去空格、標點、數字、停用詞)

# Basic normalisation: collapse whitespace, then drop punctuation and digits.
REED <- tm_map(REED, stripWhitespace)
REED <- tm_map(REED, removePunctuation)
REED <- tm_map(REED, removeNumbers)

# Stop terms deleted from the raw text, applied one at a time in this order.
# Each term is listed once: repeating the deletion of a single character is a
# no-op, because removing other characters can never create a new occurrence.
# "來" is the traditional form of "来"; it is kept so the result is unchanged
# even if the input still contains traditional characters.
# NOTE(review): these deletions run on unsegmented text, so the characters are
# also stripped from inside longer words, including character names such as
# "王夫人", "周瑞家的" and "空空道人". Consider filtering stop words after
# segmentation instead.
stop_terms <- c(
  "的", "了", "來", "我", "又", "这", "说", "道", "你", "去",
  "她", "听", "如今", "有", "在", "是", "也", "都", "来", "不",
  "便", "就", "得", "们", "个", "他", "呢", "人", "那", "着",
  "里", "什", "皆", "之", "只", "上", "好", "吃", "要", "一",
  "见", "家", "笑", "与", "过", "忙", "等", "还", "么", "中",
  "因"
)
for (stop_term in stop_terms) {
  REED <- tm_map(REED, toSpace, stop_term)
}

Step4:使用read_html與html_nodes讀入維基百科的角色列表,存為CSV檔並設為worker的自訂詞典

# Download the Wikipedia character-list page and collect the text of every
# link found in the first cell of each table row that follows another row.
path <- "https://zh.wikipedia.org/wiki/%E7%BA%A2%E6%A5%BC%E6%A2%A6%E4%BA%BA%E7%89%A9%E5%88%97%E8%A1%A8"
data <- html_text(
  html_nodes(read_html(path), "tr+ tr td:nth-child(1) a")
)
# Print the scraped names for inspection.
data
##   [1] "<U+8D3E>演" "<U+8D3E>源" "<U+8D3E>代化" "<U+8D3E>代善" "<U+8D3E>代儒"
##   [6] "<U+8D3E>代修" "<U+8D3E>敷" "<U+8D3E>敬" "<U+8D3E>赦" "<U+8D3E>政"
##  [11] "<U+8D3E>敏" "<U+8D3E>敕" "<U+8D3E>效" "<U+8D3E>敦" "<U+8D3E>珍"
##  [16] "<U+8D3E><U+740F>" "<U+8D3E>琮" "<U+8D3E>珠" "<U+8D3E><U+5B9D>玉" "<U+8D3E><U+73AF>"
##  [21] "<U+8D3E>瑞" "<U+8D3E>璜" "<U+8D3E>珩" "賈<U+3EDE>" "<U+8D3E>珖"
##  [26] "<U+8D3E>琛" "<U+8D3E><U+743C>" "<U+8D3E>璘" "<U+8D3E>元春" "<U+8D3E>迎春"
##  [31] "<U+8D3E>探春" "<U+8D3E>惜春" "喜<U+9E3E>" "四姐"       "<U+8D3E>蓉"
##  [36] "<U+8D3E><U+5170>" "<U+8D3E><U+8537>" "<U+8D3E>菌" "<U+8D3E>芸" "<U+8D3E>芹"
##  [41] "<U+8D3E>萍" "<U+8D3E>菖" "<U+8D3E>菱" "<U+8D3E>蓁" "<U+8D3E>藻"
##  [46] "<U+8D3E>蘅" "<U+8D3E>芬" "<U+8D3E>芳" "<U+8D3E>芝" "<U+8D3E>荇"
##  [51] "<U+8D3E>芷" "<U+8D3E>葛" "<U+8D3E>巧姐" "史太君"     "史鼐"      
##  [56] "史鼎"       "史湘云"     "王子<U+817E>" "王子胜"     "王夫人"    
##  [61] "薛姨<U+5988>" "王仁"       "王熙<U+51E4>" "薛蟠"       "薛蝌"      
##  [66] "薛<U+5B9D><U+9497>" "薛<U+5B9D>琴" "林黛玉"     "妙玉"       "邢夫人"    
##  [71] "尤氏"       "李<U+7EA8>" "秦可卿"     "<U+8D3E>蓉之妻" "香菱"      
##  [76] "<U+8D75>姨娘" "<U+5218>姥姥" "甄<U+5B9D>玉" "<U+88AD>人" "媚人"      
##  [81] "晴雯"       "<U+7EEE>霰" "麝月"       "檀云"       "秋<U+7EB9>"
##  [86] "碧浪"       "茜雪"       "春燕"       "<U+5760>儿" "四儿"      
##  [91] "佳蕙"       "抱琴"       "司棋"       "<U+83B2>花儿" "<U+7EE3>橘"
##  [96] "待<U+4E66>" "翠墨"       "<U+8749>姐" "入<U+753B>" "彩屏"      
## [101] "紫<U+9E43>" "雪雁"       "春<U+7EA4>" "<U+9E33><U+9E2F>" "琥珀"      
## [106] "珍珠"       "玻璃"       "翡翠"       "鸚鵡"       "靛儿"      
## [111] "傻大姐"     "<U+94F6>蝶" "炒豆儿"     "<U+5350>儿" "<U+83BA>儿"
## [116] "文杏"       "平儿"       "小<U+7EA2>" "丰儿"       "金<U+948F>"
## [121] "玉<U+948F>" "<U+7EE3><U+9E3E>" "<U+7EE3><U+51E4>" "彩云"       "彩霞"      
## [126] "素云"       "同喜"       "同<U+8D35>" "<U+7F15>儿" "翠<U+7F15>"
## [131] "<U+5B9D>珠" "瑞珠"       "姣杏"       "小螺"       "善姐"      
## [136] "臻儿"       "篆儿"       "小吉祥儿"   "小<U+9E4A>" "小舍儿"    
## [141] "<U+5B9D>蟾" "茗<U+70DF>" "焦大"       "李<U+8D35>" "<U+9504><U+836F>"
## [146] "墨雨"       "伴<U+9E64>" "<U+626B>花" "引泉"       "挑芸"      
## [151] "<U+53CC>瑞" "<U+53CC><U+5BFF>" "<U+6765>旺" "<U+5174>儿" "王<U+8363>"
## [156] "<U+94B1><U+542F>" "<U+5F20>若<U+9526>" "<U+8D75>亦<U+534E>" "<U+94B1>槐" "小玄儿"    
## [161] "隆儿"       "昭儿"       "喜儿"       "住儿"       "<U+5BFF>儿"
## [166] "杏奴"       "<U+5E86>儿" "王信"       "芳官"       "<U+9F84>官"
## [171] "蕊官"       "藕官"       "荳官"       "<U+5B9D>官" "文官"      
## [176] "茄官"       "菂官"       "艾官"       "玉官"       "葵官"      
## [181] "茫茫大士"   "渺渺真人"   "空空道人"   "甄士<U+9690>" "封氏"      
## [186] "小童"       "神瑛侍者"   "<U+7EDB>珠仙子" "警幻仙子"   "<U+8D3E>雨村"
## [191] "<U+4E25>老<U+7237>" "霍<U+542F>" "封<U+8083>" "冷子<U+5174>" "林如海"    
## [196] "李<U+5B37><U+5B37>" "王<U+5B37><U+5B37>" "<U+95E8>子" "李守中"     "<U+51AF><U+6E0A>"
## [201] "拐子"       "痴<U+68A6>仙姑" "引愁金女"   "种情大士"   "度恨菩提"  
## [206] "王成"       "<U+5218>氏" "板儿"       "青儿"       "周瑞"      
## [211] "周瑞家的"   "智能"       "余信"       "余信家的"   "秦<U+949F>"
## [216] "<U+8D56>二" "詹光"       "戴良"       "<U+94B1><U+534E>" "<U+5355>聘仁"
## [221] "<U+5434>新登" "秦<U+4E1A>" "胡氏"       "金氏"       "<U+51AF>唐"
## [226] "<U+5F20>友士" "戴<U+6743>" "<U+5F20>材家的" "牛清"       "牛<U+7EE7>宗"
## [231] "柳彪"       "柳芳"       "<U+9648>翼" "<U+9648>瑞文" "<U+9A6C>魁"
## [236] "<U+9A6C>尚" "侯<U+6653>明" "侯孝康"     "石光珠"     "<U+848B>子宁"
## [241] "<U+8C22><U+9CB8>" "戚建<U+8F89>" "裘良"       "<U+51AF>紫英" "<U+9648>也俊"
## [246] "<U+536B>若<U+5170>" "水溶"       "二丫<U+5934>" "<U+51C0><U+865A>" "智善"      
## [251] "胡老<U+7237>" "金哥"       "李公子"     "云光"       "夏守忠"    
## [256] "<U+8D56>大" "<U+8D75><U+5B37><U+5B37>" "<U+5434>天佑" "<U+5434><U+8D35>妃" "卜固修"    
## [261] "山子野"     "林之孝"     "程日<U+5174>" "昭容"       "彩<U+7F24>"
## [266] "花母"       "花自芳"     "多官"       "多姑娘"     "王嫂子"    
## [271] "周氏"       "卜世仁"     "<U+94F6>姐" "倪二"       "王短腿"    
## [276] "林之孝家的" "方椿"       "<U+9A6C>道婆" "周姨娘"     "胡斯<U+6765>"
## [281] "<U+9C8D>太<U+533B>" "王<U+6D4E>仁" "<U+848B>玉菡" "云儿"       "<U+5F20>道士"
## [286] "周奶娘"     "傅<U+8BD5>" "傅秋芳"     "宋<U+5B37><U+5B37>" "茗玉"      
## [291] "王君效"     "<U+8D56>大的母<U+4EB2>" "<U+9C8D>二家的" "金彩"       "金文翔"    
## [296] "嫣<U+7EA2>" "柳湘<U+83B2>" "<U+8D56>尚<U+8363>" "邢岫<U+70DF>" "邢忠"      
## [301] "李<U+5A76>娘" "李<U+7EB9>" "李<U+7EEE>" "梅翰林"     "胡君<U+8363>"
## [306] "良儿"       "<U+4E4C><U+8FDB>孝" "<U+5A04>氏" "女先儿"     "<U+5355>大良"
## [311] "<U+8D75><U+56FD>基" "<U+5355>大娘" "祝<U+5988>" "田<U+5988>" "<U+53F6><U+5988>"
## [316] "<U+8BB8>氏" "何婆子"     "小<U+9E20>儿" "夏婆子"     "柳家的"    
## [321] "柳五儿"     "秦<U+663E>家的" "佩<U+51E4>" "偕<U+9E3E>" "尤二姐"    
## [326] "尤三姐"     "尤老娘"     "<U+5F20><U+534E>" "俞<U+7984>" "秋桐"      
## [331] "天文生"     "潘又安"     "朱大娘"     "周太<U+76D1>" "小霞"      
## [336] "翠云"       "<U+6765>喜家的" "王善保家的" "<U+5F20><U+5988>" "邢德全"    
## [341] "文花"       "<U+5706>信" "智通"       "<U+5B59><U+7ECD>祖" "夏金桂"    
## [346] "夏奶奶"     "王一<U+8D34>" "賈寶玉"     "<U+8D3E>政" "<U+8D3E>赦"
## [351] "<U+8D3E><U+740F>" "賈珍"       "<U+8D3E><U+73AF>" "薛蟠"       "賈母"      
## [356] "王夫人"     "薛姨<U+5988>" "尤氏"       "平儿"       "<U+8D75>姨娘"
## [361] "<U+9E33><U+9E2F>" "襲人"       "晴雯"       "香菱"       "紫<U+9E43>"
## [366] "麝月"       "小<U+7EA2>" "金<U+948F>" "<U+9F84>官" "甄士<U+9690>"
## [371] "<U+8D3E>雨村" "劉姥姥"     "其他"
# Store the scraped names in a one-column tibble. The column is named `value`,
# the same name the deprecated as_data_frame() call gave it, so the CSV written
# from it keeps the same header.
data <- tibble(value = data)
# Save the scraped names to "Names.csv" in the working directory; the same
# file is passed to jiebaR's worker() as the user dictionary.
readr::write_csv(data, "Names.csv")
# Read the file back in (one character column, `value`).
names <- readr::read_csv("Names.csv")
## Parsed with column specification:
## cols(
##   value = col_character()
## )
# jiebaR segmenter that loads "Names.csv" as its user dictionary.
# TRUE/FALSE are spelled out because T and F are ordinary, reassignable
# variables.
seg <- worker(
  bylines = FALSE, symbol = TRUE,
  user = "Names.csv"
)

Step5:斷詞

# Segmenter used for the frequency counts. It loads "Names.csv" (the scraped
# character list) as its user dictionary so that character names are kept as
# single tokens; a bare worker() ignores that list entirely.
mixseg <- worker(user = "Names.csv")

# Segment one corpus document into a character vector of tokens.
#   d - a single element of the corpus; its first component is the text
jieba_tokenizer <- function(d) {
  unlist(segment(d[[1]], mixseg))
}

# Tokenise every document, then tabulate how often each token occurs
# (columns `Var1` = token, `Freq` = count).
seg <- lapply(REED, jieba_tokenizer)
freqFrame <- as.data.frame(table(unlist(seg)))

Step6:文字雲製作

# Use a font that can render Chinese glyphs in the plot.
par(family = "Heiti TC Light")

# Draw the word cloud from the token frequency table: only tokens seen at
# least 30 times, at most 150 of them, 10% rotated, coloured with the
# 8-colour "Dark2" palette.
wordcloud(
  words = freqFrame$Var1,
  freq = freqFrame$Freq,
  scale = c(5, 0.1),
  min.freq = 30,
  max.words = 150,
  random.order = TRUE,
  random.color = FALSE,
  rot.per = 0.1,
  colors = brewer.pal(8, "Dark2"),
  ordered.colors = FALSE,
  use.r.layout = FALSE,
  fixed.asp = TRUE
)

Step7:顯示各詞出現次數

# `freqFrame` already holds the token counts produced by the segmentation
# step, so it is reused here instead of re-segmenting the whole corpus with
# identical code. Sort tokens from most to least frequent.
freqFrame <- freqFrame[order(freqFrame$Freq, decreasing = TRUE), ]
library(knitr)
# Show the six most frequent tokens as a markdown table.
kable(head(freqFrame), format = "markdown")
Var1 Freq
1964 352
2346 294
689 宝玉 255
1177 232
1485 228
232 165

Result&Problem:做出文字雲與文字出現次數,但無法呈現角色出現多寡(只有寶玉和劉姥姥等名字),推測是因為: 1.其他字數量相對較多 2.小說中人物大多以二字呈現

Probable solution:修改名單、找到直接呈現名字出現次數的code寫法