########################################################
#-------------------------------------------------------
# Topic: Alibaba crawler driven by simulated browser access
# Author:
# Date:Sun Mar 08 19:00:35 2020
# Mail:
#-------------------------------------------------------
########################################################
#-------------------------------------------------------
# Function 1: reach the Alibaba home page by searching for it on Baidu
#-------------------------------------------------------
library(RSelenium)
library(rvest)
library(stringr)
# NOTE(review): several statements in this section consist of a bare variable
# name (`remDr`, `url`, `search_box`, `url_dir`). They look like assignments
# whose right-hand side was lost; as written each one only evaluates the name
# and fails if the name is not already defined. Restore the original
# expressions before running.
# Presumably the RSelenium remote-driver object -- TODO confirm how it is built.
remDr
# Start the browser session through the driver's open() method.
remDr$open()
# Presumably the Baidu start URL (per the section title) -- TODO confirm.
url
remDr$navigate(url)
# Presumably the located search input element -- TODO confirm the selector.
search_box
search_box
# Move the pointer onto the search box, then send the query string
# (Chinese for "Alibaba") followed by the Enter key.
remDr$mouseMoveToLocation(webElement = search_box)
search_box$sendKeysToElement(list("阿里巴巴",key="enter"))
# Presumably the search-result link that leads to Alibaba -- TODO confirm.
url_dir
url_dir
# Move the pointer onto that element and click at the pointer position.
remDr$mouseMoveToLocation(webElement = url_dir)
remDr$click()
#-------------------------------------------------------
# Function 2: run a simulated product search on Alibaba
#-------------------------------------------------------
# Hand the driver and the second window handle to `myswitch`; presumably this
# switches focus to the newly opened Alibaba window.
# NOTE(review): `myswitch` is not defined in the visible part of this file --
# confirm it is defined or sourced before this line.
myswitch(remDr,remDr$getWindowHandles()[[2]])
# Evaluate the current URL (the result is not stored).
remDr$getCurrentUrl()
# NOTE(review): `ali_search` and `list_vis` below are bare names; they look
# like assignments whose right-hand side was lost. Presumably `ali_search` is
# the Alibaba search input element and `list_vis` a control on the result
# page -- TODO confirm and restore the original expressions.
ali_search
ali_search
# Send the product query followed by the Enter key.
ali_search$sendKeysToElement(list("25mm mink eyelashes",key="enter"))
list_vis
list_vis
# Move the pointer onto `list_vis` and click at the pointer position.
remDr$mouseMoveToLocation(webElement = list_vis)
remDr$click()
#-------------------------------------------------------
# Function 3: scrape the information returned by the search
#-------------------------------------------------------
# Scrape the information using rvest
# (rvest is already attached at the top of the file; this repeated call is
# harmless.)
library(rvest)
# NOTE(review): the bare names below look like assignments whose right-hand
# side was lost. Presumably they held the selectors used later for the product
# info, price, ranges, detail links and the pager input / "go" button --
# TODO confirm and restore the original expressions.
info_list
price
ranges
detail_url_dir
input_num
input_go
# XPath matching <a> elements whose class attribute contains
# "organic-gallery-title".
sub_dir_url='//a[contains(@class,"organic-gallery-title")]'
#current_url1
# Load the result listing for "25mm mink eyelashes" directly from a hard-coded URL.
remDr$navigate("https://www.alibaba.com/products/25mm_mink_eyelashes.html?spm=a2700.galleryofferlist.0.0.3615221bsbyKgv&IndexArea=product_en&viewtype=G")
# NOTE(review): presumably these four lines initialised the result containers
# that the page loop indexes by `i` -- TODO confirm and restore the
# initialisers.
detail_info_list
information_all
sub_dir
url_all
# Iterate over page numbers 1 to 100: jump to page i, scroll down until the
# page height stops growing, then process the page.
# NOTE(review): many statements in the loop body are a bare variable name
# (`input_num1`, `input_go1`, `web`, `information`, `sub_dir1`, ...). They look
# like assignments whose right-hand side was lost -- restore the original
# expressions before running.
for (i in 1:100) {
# Enter the page number to go to that page
# Presumably `input_num1` is the pager's text input and `input_go1` its
# confirm button -- TODO confirm.
input_num1
input_num1$sendKeysToElement(list(as.character(i)))
input_go1
remDr$mouseMoveToLocation(webElement = input_go1)
remDr$click()
# Page scrolling: jump to the bottom, wait one second, read the document
# height again, and stop once the height no longer changes.
last_height = 0
repeat {
# The body element is passed as a script argument, although neither script
# text refers to its arguments.
remDr$executeScript("window.scrollTo(0,document.body.scrollHeight);", list(remDr$findElement("css","body")))
Sys.sleep(1)
new_height=remDr$executeScript("return document.body.scrollHeight", list(remDr$findElement("css","body")))
if(unlist(last_height) == unlist(new_height)) {break} else
{last_height = new_height}
}
# Pause for three seconds after scrolling finishes.
Sys.sleep(3)
# Parse the web page
# Presumably `web` held the parsed page source -- TODO confirm.
web
# Progress indicator: print the current page number.
print(i)
# Presumably the extracted data was stored in `information_all[[i]]` and named
# per page -- TODO confirm.
information
information_all[[i]]
names(information_all)[i]
# Presumably the product links for this page were stored in `url_all[[i]]`
# -- TODO confirm.
sub_dir1
sub_dir1
sub_dir1
sub_dir
url_all[[i]]
names(url_all)[[i]]
# The original plan was to scrape each product's detail page here; it was
# dropped because the dynamic pages took too long to load (slow network).
# current_url
# detail_url
# detail_url
# detail_url
# detail_info_list[[i]]
# names(detail_info_list)[[i]]
# for (j in 1:length(detail_url)) {
#
# remDr$navigate(detail_url[j])
# Sys.sleep(10)
# web
# detail_info
# detail_info
# if(length(detail_info)==0){
# detail_info="non_detected"
# }else{detail_info
# detail_info_list[[i]][[j]]
# names(detail_info_list[[i]])[j]
# print(j)
# }
# remDr$navigate(current_url[[1]])
# Pause for five seconds before moving on to the next page.
Sys.sleep(5)
}
#-------------------------------------------------------
# Function 4: tidy and save the results
#-------------------------------------------------------
# NOTE(review): almost every statement in this section is a bare expression.
# As written each one only evaluates (and, at top level, prints) its value;
# nothing is assigned. They look like assignments whose right-hand side was
# lost -- restore the original expressions before running.
# Inspect the first page's entry in `information_all`.
information_all$page1[[1]]
information_all$page1
information_all$page1
# Presumably `information` was initialised here as the combined table
# -- TODO confirm.
information
# Presumably each page's data was reshaped into `information1`, given column
# names, and appended to `information` -- TODO confirm.
for (i in 1:100) {
information1
information1
colnames(information1)
information
}
information2
# Presumably the `price` and `unit` columns were derived here -- TODO confirm.
information$price
information$unit
information
# Look at columns 4 and 5, and at the first six rows of the first five columns.
information[,c(4,5)]
information[1:6,1:5]
# Presumably columns 4 and 5 were renamed and a `url` column added
# -- TODO confirm.
colnames(information)[c(4,5)]
information$url
colnames(information)
information
# Preview the first rows of the table.
head(information)
# Write the `information` object to "information.xlsx" (a path relative to
# the current working directory), omitting the row-name column.
library(xlsx)
# Use FALSE rather than F: `F` is an ordinary variable that can be reassigned,
# which would silently change this argument.
write.xlsx(information, "information.xlsx", row.names = FALSE)