-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChapter1 Crawling.R
132 lines (99 loc) · 3.72 KB
/
Chapter1 Crawling.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
# Packages this script expects to be available. Any name that does not
# appear in the local installed-package table is installed, together with
# its dependencies, before the rest of the script runs.
pkg <- c(
  "magrittr", "quantmod", "rvest", "httr", "jsonlite",
  "readr", "readxl", "stringr", "lubridate", "dplyr",
  "tidyr", "ggplot2", "corrplot", "dygraphs",
  "highcharter", "plotly", "PerformanceAnalytics",
  "nloptr", "quadprog", "RiskPortfolios", "cccp",
  "timetk", "broom", "stargazer", "timeSeries"
)

# Subset of `pkg` that is not installed yet (empty when nothing is missing).
already_installed <- pkg %in% installed.packages()[, "Package"]
new.pkg <- pkg[!already_installed]

if (length(new.pkg)) {
  install.packages(new.pkg, dependencies = TRUE)
}
library(httr)
library(rvest)
library(readr)
# Download the KRX sector table and cache it at data/krx_sector.csv.
#
# The download is a two-step exchange: the first POST asks GenerateOTP.jspx
# for a code describing the requested file, and the second POST hands that
# code to download.jspx, whose response body is parsed as CSV.
gen_otp_url <-
  'http://marketdata.krx.co.kr/contents/COM/GenerateOTP.jspx'
# NOTE(review): `date` is hard-coded; confirm whether it should follow the
# business day that is scraped further down in this script.
gen_otp_data <- list(
  name = 'fileDown',
  filetype = 'csv',
  url = 'MKD/03/0303/03030103/mkd03030103',
  tp_cd = 'ALL',
  date = '20190612',
  lang = 'ko',
  pagePath = '/contents/MKD/03/0303/03030103/MKD03030103.jsp'
)

# Step 1: obtain the code. stop_for_status() raises an R error on an HTTP
# failure so an error page is never mistaken for a valid code.
otp <- POST(gen_otp_url, query = gen_otp_data) %>%
  stop_for_status() %>%
  read_html() %>%
  html_text()

# Step 2: exchange the code for the file. The referer header is set to the
# OTP endpoint (presumably the download server checks it - TODO confirm).
down_url <- 'http://file.krx.co.kr/download.jspx'
down_sector <- POST(down_url, query = list(code = otp),
                    add_headers(referer = gen_otp_url)) %>%
  stop_for_status() %>%
  read_html() %>%
  html_text() %>%
  read_csv()
print(down_sector)

# Create the cache directory on first run, then persist the table.
if (!dir.exists('data')) {
  dir.create('data')
}
write.csv(down_sector, 'data/krx_sector.csv')
# Download the KRX per-stock indicator table and cache it at
# data/krx_ind.csv.
#
# Same two-step exchange as any KRX file download in this script: POST the
# request description to GenerateOTP.jspx to get a code, then POST that code
# to download.jspx and parse the response body as CSV.
gen_otp_url <-
  'http://marketdata.krx.co.kr/contents/COM/GenerateOTP.jspx'
# NOTE(review): `schdate` is hard-coded; confirm whether it should follow
# the business day that is scraped further down in this script.
gen_otp_data <- list(
  name = 'fileDown',
  filetype = 'csv',
  url = "MKD/13/1302/13020401/mkd13020401",
  market_gubun = 'ALL',
  gubun = '1',
  schdate = '20190607',
  pagePath = "/contents/MKD/13/1302/13020401/MKD13020401.jsp"
)

# Step 1: obtain the code; fail loudly on an HTTP error instead of parsing
# an error page as if it were the code.
otp <- POST(gen_otp_url, query = gen_otp_data) %>%
  stop_for_status() %>%
  read_html() %>%
  html_text()

# Step 2: exchange the code for the CSV payload. The referer header is set
# to the OTP endpoint (presumably required by the server - TODO confirm).
down_url <- 'http://file.krx.co.kr/download.jspx'
down_ind <- POST(down_url, query = list(code = otp),
                 add_headers(referer = gen_otp_url)) %>%
  stop_for_status() %>%
  read_html() %>%
  html_text() %>%
  read_csv()
print(down_ind)

# Make sure the cache directory exists before writing into it.
if (!dir.exists('data')) {
  dir.create('data')
}
write.csv(down_ind, 'data/krx_ind.csv')
library(stringr)
# Scrape a date from the Naver Finance deposit page and store it in
# `biz_day` with the dots removed (e.g. "2019.06.12" -> "20190612").
# Presumably this is the latest business day - confirm against the page.
url <- 'https://finance.naver.com/sise/sise_deposit.nhn'
biz_day <- GET(url) %>%
  read_html(encoding = 'EUC-KR') %>%
  html_nodes(xpath =
               '//*[@id="type_1"]/div/ul[2]/li/span') %>%
  html_text() %>%
  # Dots are escaped so only a literal "digits.digits.digits" date matches;
  # str_extract() yields a plain character vector rather than a matrix.
  str_extract('[0-9]+\\.[0-9]+\\.[0-9]+') %>%
  str_replace_all('\\.', '')

# Later steps paste biz_day into request URLs, so stop here if the page
# layout changed and no date could be extracted.
if (length(biz_day) == 0 || anyNA(biz_day)) {
  stop("Could not extract a date from ", url, call. = FALSE)
}
print(biz_day)
# Build the cleaned ticker table from the two cached KRX downloads and save
# it to data/KOR_ticker.csv.
#
# Column names used below (Korean, as they appear in the CSV headers):
#   종목명       - stock name
#   종목코드     - ticker code
#   시가총액.원. - market capitalisation in KRW
down_sector <- read.csv('data/krx_sector.csv', row.names = 1,
                        stringsAsFactors = FALSE)
down_ind <- read.csv('data/krx_ind.csv', row.names = 1,
                     stringsAsFactors = FALSE)

# Diagnostics (auto-printed at top level): the columns the two tables share,
# and the stock names present in the sector table but not in the other.
common_cols <- intersect(names(down_sector), names(down_ind))
common_cols
setdiff(down_sector[, '종목명'], down_ind[, '종목명'])

# Inner join on every shared column.
KOR_ticker <- merge(down_sector, down_ind,
                    by = common_cols,
                    all = FALSE)

# Sort by market cap, largest first. order() needs a plain vector, so the
# column is extracted with [[ ]] instead of passing a one-column data frame.
KOR_ticker <- KOR_ticker[order(-KOR_ticker[['시가총액.원.']]), ]
print(head(KOR_ticker))

# Rows that are about to be dropped, shown for inspection:
#   * names containing '스팩' (SPAC)
#   * ticker codes whose last character is not "0"
#     (presumably preferred shares - confirm)
is_spac <- grepl('스팩', KOR_ticker[, '종목명'])
ends_in_zero <- str_sub(KOR_ticker[, '종목코드'], -1, -1) == "0"
KOR_ticker[is_spac, '종목명']
KOR_ticker[!ends_in_zero, '종목명']

# Apply the two filters one after the other; the second mask is recomputed
# because the first filter changes the row set.
KOR_ticker <- KOR_ticker[!is_spac, ]
KOR_ticker <- KOR_ticker[str_sub(KOR_ticker[, '종목코드'], -1, -1) == "0", ]

# Reset row names so the saved file is numbered 1..n in sorted order.
rownames(KOR_ticker) <- NULL
write.csv(KOR_ticker, 'data/KOR_ticker.csv')
# Download the index constituents for each sector code from wiseindex.com
# for the date held in `biz_day`, stack them into one table, and save it to
# data/KOR_sector.csv.
#
# The stray `do.call(rbind, data_sector)` that used to precede the loop was
# removed: it ran before `data_sector` existed and aborted a fresh session.
sector_code <- c('G25', 'G35', 'G50', 'G40', 'G10',
                 'G20', 'G55', 'G30', 'G15', 'G45')

# One list element per sector code, keyed by the code itself.
data_sector <- list()
for (i in sector_code) {
  # NOTE: the endpoint path is reproduced verbatim from the original script;
  # do not "correct" its spelling without checking against the server.
  url <- paste0(
    'http://www.wiseindex.com/Index/GetIndexComponets',
    '?ceil_yn=0&dt=', biz_day, '&sec_cd=', i)

  # jsonlite is never attached with library() in this script, so the call
  # is namespace-qualified. Only the `list` member of the response is kept.
  data <- jsonlite::fromJSON(url)
  data_sector[[i]] <- data$list

  # Pause between requests to avoid hammering the server.
  Sys.sleep(1)
}

# Row-bind the per-sector tables and persist the result.
data_sector <- do.call(rbind, data_sector)
write.csv(data_sector, 'data/KOR_sector.csv')