Step1:載入package
library(magrittr)
library(tmcn)
## # tmcn Version: 0.2-12
library(NLP)
library(xml2)
library(tm)
library(tmap)
library(jiebaRD)
library(jiebaR)
library(RColorBrewer)
library(wordcloud)
library(rvest)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
Step3:文本清理(去空格、標點、數字、停用詞)
# Text cleanup for the corpus `REED`: collapse whitespace, drop punctuation and
# numbers, then blank out a fixed list of high-frequency function words.
REED <- tm_map(REED, stripWhitespace)
REED <- tm_map(REED, removePunctuation)
REED <- tm_map(REED, removeNumbers)

# Terms handed to `toSpace`, in the order they are applied. The list mixes
# Traditional ("來") and Simplified ("来") forms on purpose; both are kept.
# Each term appears exactly once: repeating a term is a no-op, because no
# later replacement can reintroduce it.
stop_terms <- c(
  "的", "了", "來", "我", "又", "这", "说", "道", "你", "去",
  "她", "听", "如今", "有", "在", "是", "也", "都", "来", "不",
  "便", "就", "得", "们", "个", "他", "呢", "人", "那", "着",
  "里", "什", "皆", "之", "只", "上", "好", "吃", "要", "一",
  "见", "家", "笑", "与", "过", "忙", "等", "还", "么", "中",
  "因"
)

# NOTE(review): `toSpace` is not defined in this section; presumably it is a
# content transformer that replaces the given pattern with a space -- confirm
# against its definition earlier in the file.
for (term in stop_terms) {
  REED <- tm_map(REED, toSpace, term)
}
Step4:使用read_html
及html_nodes
讀入維基百科的角色列表,存為CSV檔並設為worker的使用者詞典
# Chinese Wikipedia page listing the characters of "Dream of the Red Chamber"
# (the percent-encoded part of the URL decodes to the page title).
path <- "https://zh.wikipedia.org/wiki/%E7%BA%A2%E6%A5%BC%E6%A2%A6%E4%BA%BA%E7%89%A9%E5%88%97%E8%A1%A8"
# Download the page, select the links in the first cell of every table row
# that follows another row, and keep only the link text.
data <- html_text(
  html_nodes(read_html(path), "tr+ tr td:nth-child(1) a")
)
# Print the scraped character vector.
data
## [1] "<U+8D3E>演" "<U+8D3E>源" "<U+8D3E>代化" "<U+8D3E>代善" "<U+8D3E>代儒"
## [6] "<U+8D3E>代修" "<U+8D3E>敷" "<U+8D3E>敬" "<U+8D3E>赦" "<U+8D3E>政"
## [11] "<U+8D3E>敏" "<U+8D3E>敕" "<U+8D3E>效" "<U+8D3E>敦" "<U+8D3E>珍"
## [16] "<U+8D3E><U+740F>" "<U+8D3E>琮" "<U+8D3E>珠" "<U+8D3E><U+5B9D>玉" "<U+8D3E><U+73AF>"
## [21] "<U+8D3E>瑞" "<U+8D3E>璜" "<U+8D3E>珩" "賈<U+3EDE>" "<U+8D3E>珖"
## [26] "<U+8D3E>琛" "<U+8D3E><U+743C>" "<U+8D3E>璘" "<U+8D3E>元春" "<U+8D3E>迎春"
## [31] "<U+8D3E>探春" "<U+8D3E>惜春" "喜<U+9E3E>" "四姐" "<U+8D3E>蓉"
## [36] "<U+8D3E><U+5170>" "<U+8D3E><U+8537>" "<U+8D3E>菌" "<U+8D3E>芸" "<U+8D3E>芹"
## [41] "<U+8D3E>萍" "<U+8D3E>菖" "<U+8D3E>菱" "<U+8D3E>蓁" "<U+8D3E>藻"
## [46] "<U+8D3E>蘅" "<U+8D3E>芬" "<U+8D3E>芳" "<U+8D3E>芝" "<U+8D3E>荇"
## [51] "<U+8D3E>芷" "<U+8D3E>葛" "<U+8D3E>巧姐" "史太君" "史鼐"
## [56] "史鼎" "史湘云" "王子<U+817E>" "王子胜" "王夫人"
## [61] "薛姨<U+5988>" "王仁" "王熙<U+51E4>" "薛蟠" "薛蝌"
## [66] "薛<U+5B9D><U+9497>" "薛<U+5B9D>琴" "林黛玉" "妙玉" "邢夫人"
## [71] "尤氏" "李<U+7EA8>" "秦可卿" "<U+8D3E>蓉之妻" "香菱"
## [76] "<U+8D75>姨娘" "<U+5218>姥姥" "甄<U+5B9D>玉" "<U+88AD>人" "媚人"
## [81] "晴雯" "<U+7EEE>霰" "麝月" "檀云" "秋<U+7EB9>"
## [86] "碧浪" "茜雪" "春燕" "<U+5760>儿" "四儿"
## [91] "佳蕙" "抱琴" "司棋" "<U+83B2>花儿" "<U+7EE3>橘"
## [96] "待<U+4E66>" "翠墨" "<U+8749>姐" "入<U+753B>" "彩屏"
## [101] "紫<U+9E43>" "雪雁" "春<U+7EA4>" "<U+9E33><U+9E2F>" "琥珀"
## [106] "珍珠" "玻璃" "翡翠" "鸚鵡" "靛儿"
## [111] "傻大姐" "<U+94F6>蝶" "炒豆儿" "<U+5350>儿" "<U+83BA>儿"
## [116] "文杏" "平儿" "小<U+7EA2>" "丰儿" "金<U+948F>"
## [121] "玉<U+948F>" "<U+7EE3><U+9E3E>" "<U+7EE3><U+51E4>" "彩云" "彩霞"
## [126] "素云" "同喜" "同<U+8D35>" "<U+7F15>儿" "翠<U+7F15>"
## [131] "<U+5B9D>珠" "瑞珠" "姣杏" "小螺" "善姐"
## [136] "臻儿" "篆儿" "小吉祥儿" "小<U+9E4A>" "小舍儿"
## [141] "<U+5B9D>蟾" "茗<U+70DF>" "焦大" "李<U+8D35>" "<U+9504><U+836F>"
## [146] "墨雨" "伴<U+9E64>" "<U+626B>花" "引泉" "挑芸"
## [151] "<U+53CC>瑞" "<U+53CC><U+5BFF>" "<U+6765>旺" "<U+5174>儿" "王<U+8363>"
## [156] "<U+94B1><U+542F>" "<U+5F20>若<U+9526>" "<U+8D75>亦<U+534E>" "<U+94B1>槐" "小玄儿"
## [161] "隆儿" "昭儿" "喜儿" "住儿" "<U+5BFF>儿"
## [166] "杏奴" "<U+5E86>儿" "王信" "芳官" "<U+9F84>官"
## [171] "蕊官" "藕官" "荳官" "<U+5B9D>官" "文官"
## [176] "茄官" "菂官" "艾官" "玉官" "葵官"
## [181] "茫茫大士" "渺渺真人" "空空道人" "甄士<U+9690>" "封氏"
## [186] "小童" "神瑛侍者" "<U+7EDB>珠仙子" "警幻仙子" "<U+8D3E>雨村"
## [191] "<U+4E25>老<U+7237>" "霍<U+542F>" "封<U+8083>" "冷子<U+5174>" "林如海"
## [196] "李<U+5B37><U+5B37>" "王<U+5B37><U+5B37>" "<U+95E8>子" "李守中" "<U+51AF><U+6E0A>"
## [201] "拐子" "痴<U+68A6>仙姑" "引愁金女" "种情大士" "度恨菩提"
## [206] "王成" "<U+5218>氏" "板儿" "青儿" "周瑞"
## [211] "周瑞家的" "智能" "余信" "余信家的" "秦<U+949F>"
## [216] "<U+8D56>二" "詹光" "戴良" "<U+94B1><U+534E>" "<U+5355>聘仁"
## [221] "<U+5434>新登" "秦<U+4E1A>" "胡氏" "金氏" "<U+51AF>唐"
## [226] "<U+5F20>友士" "戴<U+6743>" "<U+5F20>材家的" "牛清" "牛<U+7EE7>宗"
## [231] "柳彪" "柳芳" "<U+9648>翼" "<U+9648>瑞文" "<U+9A6C>魁"
## [236] "<U+9A6C>尚" "侯<U+6653>明" "侯孝康" "石光珠" "<U+848B>子宁"
## [241] "<U+8C22><U+9CB8>" "戚建<U+8F89>" "裘良" "<U+51AF>紫英" "<U+9648>也俊"
## [246] "<U+536B>若<U+5170>" "水溶" "二丫<U+5934>" "<U+51C0><U+865A>" "智善"
## [251] "胡老<U+7237>" "金哥" "李公子" "云光" "夏守忠"
## [256] "<U+8D56>大" "<U+8D75><U+5B37><U+5B37>" "<U+5434>天佑" "<U+5434><U+8D35>妃" "卜固修"
## [261] "山子野" "林之孝" "程日<U+5174>" "昭容" "彩<U+7F24>"
## [266] "花母" "花自芳" "多官" "多姑娘" "王嫂子"
## [271] "周氏" "卜世仁" "<U+94F6>姐" "倪二" "王短腿"
## [276] "林之孝家的" "方椿" "<U+9A6C>道婆" "周姨娘" "胡斯<U+6765>"
## [281] "<U+9C8D>太<U+533B>" "王<U+6D4E>仁" "<U+848B>玉菡" "云儿" "<U+5F20>道士"
## [286] "周奶娘" "傅<U+8BD5>" "傅秋芳" "宋<U+5B37><U+5B37>" "茗玉"
## [291] "王君效" "<U+8D56>大的母<U+4EB2>" "<U+9C8D>二家的" "金彩" "金文翔"
## [296] "嫣<U+7EA2>" "柳湘<U+83B2>" "<U+8D56>尚<U+8363>" "邢岫<U+70DF>" "邢忠"
## [301] "李<U+5A76>娘" "李<U+7EB9>" "李<U+7EEE>" "梅翰林" "胡君<U+8363>"
## [306] "良儿" "<U+4E4C><U+8FDB>孝" "<U+5A04>氏" "女先儿" "<U+5355>大良"
## [311] "<U+8D75><U+56FD>基" "<U+5355>大娘" "祝<U+5988>" "田<U+5988>" "<U+53F6><U+5988>"
## [316] "<U+8BB8>氏" "何婆子" "小<U+9E20>儿" "夏婆子" "柳家的"
## [321] "柳五儿" "秦<U+663E>家的" "佩<U+51E4>" "偕<U+9E3E>" "尤二姐"
## [326] "尤三姐" "尤老娘" "<U+5F20><U+534E>" "俞<U+7984>" "秋桐"
## [331] "天文生" "潘又安" "朱大娘" "周太<U+76D1>" "小霞"
## [336] "翠云" "<U+6765>喜家的" "王善保家的" "<U+5F20><U+5988>" "邢德全"
## [341] "文花" "<U+5706>信" "智通" "<U+5B59><U+7ECD>祖" "夏金桂"
## [346] "夏奶奶" "王一<U+8D34>" "賈寶玉" "<U+8D3E>政" "<U+8D3E>赦"
## [351] "<U+8D3E><U+740F>" "賈珍" "<U+8D3E><U+73AF>" "薛蟠" "賈母"
## [356] "王夫人" "薛姨<U+5988>" "尤氏" "平儿" "<U+8D75>姨娘"
## [361] "<U+9E33><U+9E2F>" "襲人" "晴雯" "香菱" "紫<U+9E43>"
## [366] "麝月" "小<U+7EA2>" "金<U+948F>" "<U+9F84>官" "甄士<U+9690>"
## [371] "<U+8D3E>雨村" "劉姥姥" "其他"
# Wrap the scraped names in a one-column tibble. The column is named `value`
# explicitly: that is the column name reported by read_csv() below, and
# building the tibble directly avoids the deprecated as_data_frame().
data <- tibble::tibble(value = data)
# Save the names to disk, then read the file back into `names`.
# NOTE(review): `names` shadows base::names() as a variable; the name is kept
# because code outside this section may reference it.
readr::write_csv(data, "Names.csv")
names <- readr::read_csv("Names.csv")
## Parsed with column specification:
## cols(
## value = col_character()
## )
# Create the jiebaR segmenter, passing the saved name list as its user
# dictionary; spelled-out TRUE/FALSE replace the reassignable T/F shortcuts.
seg <- worker(bylines = FALSE, symbol = TRUE,
              user = "Names.csv")