|
|
@@ -1,177 +0,0 @@
|
|
|
-package cn.reghao.bnt.web.parser;
|
|
|
-
|
|
|
-import cn.reghao.jutil.jdk.http.WebRequest;
|
|
|
-import cn.reghao.jutil.jdk.http.WebResponse;
|
|
|
-import cn.reghao.jutil.jdk.serializer.JsonConverter;
|
|
|
-import cn.reghao.jutil.jdk.text.TextFile;
|
|
|
-import cn.reghao.jutil.tool.http.DefaultWebRequest;
|
|
|
-import cn.reghao.bnt.browser.chrome.ReqMatcher;
|
|
|
-import cn.reghao.bnt.spider.url.BodyDataType;
|
|
|
-import cn.reghao.bnt.spider.url.Site;
|
|
|
-import cn.reghao.bnt.web.parser.db.mongo.UnparsedDataMongo;
|
|
|
-import cn.reghao.bnt.web.parser.db.mongo.UrlResourceMongo;
|
|
|
-import cn.reghao.bnt.web.parser.model.po.UnparsedData;
|
|
|
-import cn.reghao.bnt.web.parser.model.po.UrlResource;
|
|
|
-import cn.reghao.bnt.web.parser.site.bilibili.BiliCommentDataParser;
|
|
|
-import cn.reghao.bnt.web.parser.site.bilibili.db.mongo.BiliVideoMongo;
|
|
|
-import cn.reghao.bnt.web.parser.site.bilibili.model.po.BiliComment;
|
|
|
-import cn.reghao.bnt.web.parser.site.bilibili.model.po.BiliVideo;
|
|
|
-import cn.reghao.bnt.web.parser.task.DataProducer;
|
|
|
-import com.google.gson.JsonObject;
|
|
|
-import lombok.extern.slf4j.Slf4j;
|
|
|
-import org.junit.Test;
|
|
|
-import org.junit.runner.RunWith;
|
|
|
-import org.springframework.beans.factory.annotation.Autowired;
|
|
|
-import org.springframework.boot.SpringApplication;
|
|
|
-import org.springframework.boot.test.context.SpringBootTest;
|
|
|
-import org.springframework.test.context.ActiveProfiles;
|
|
|
-import org.springframework.test.context.junit4.SpringRunner;
|
|
|
-
|
|
|
-import java.util.*;
|
|
|
-
|
|
|
-/**
|
|
|
- * @author reghao
|
|
|
- * @date 2021-03-20 18:02:38
|
|
|
- */
|
|
|
-@Slf4j
|
|
|
-@ActiveProfiles("dev")
|
|
|
-@SpringBootTest(classes = SpringApplication.class)
|
|
|
-@RunWith(SpringRunner.class)
|
|
|
-public class BiliCommentSpiderTest {
|
|
|
- //*****************************************************************************************************************
|
|
|
- // 视频评论
|
|
|
- //*****************************************************************************************************************
|
|
|
- @Autowired
|
|
|
- BiliCommentDataParser biliCommentDataParser;
|
|
|
- @Test
|
|
|
- public void biliCommentTest() {
|
|
|
- Set<ReqMatcher> set = new HashSet<>();
|
|
|
- String pattern1 = "reply/wbi/main";
|
|
|
- set.add(new ReqMatcher(Site.bilibili, pattern1, BodyDataType.html, biliCommentDataParser));
|
|
|
-
|
|
|
- //String pattern2 = "x/v2/reply/reply";
|
|
|
- //set.add(new ReqMatcher(Site.bilibili, pattern2, BodyDataType.html, biliCommentDataParser));
|
|
|
-
|
|
|
- /*String url = "https://www.bilibili.com/video/BV1bA4m1c7Vm/";
|
|
|
- AbstractChrome chrome = new ChromeBrowser(false, false);
|
|
|
- chrome.getAndHandleDynamicPage(url, set);*/
|
|
|
-
|
|
|
- getBiliComment();
|
|
|
- }
|
|
|
-
|
|
|
- @Autowired
|
|
|
- UrlResourceMongo urlResourceMongo;
|
|
|
- public void getBiliComment() {
|
|
|
- WebRequest webRequest = biliWebRequest();
|
|
|
-
|
|
|
- String site = Site.bilibili.name();
|
|
|
- String parser = BiliCommentDataParser.class.getSimpleName();
|
|
|
- List<UrlResource> list = urlResourceMongo.findNotCrawled(site, parser, 1000);
|
|
|
- for (UrlResource urlResource : list) {
|
|
|
- try {
|
|
|
- String url = urlResource.getUrl();
|
|
|
- WebResponse webResponse = webRequest.get(url);
|
|
|
- int statusCode = webResponse.getStatusCode();
|
|
|
- String body = webResponse.getBody();
|
|
|
- if (statusCode != 200 || body == null) {
|
|
|
- log.info("请求 {} 失败", url);
|
|
|
- continue;
|
|
|
- }
|
|
|
-
|
|
|
- Map<String, Object> map = biliCommentDataParser.parse(url, body);
|
|
|
- List<BiliComment> biliComment = (List<BiliComment>) map.get("biliComments");
|
|
|
- if (biliComment != null) {
|
|
|
- urlResourceMongo.updateCrawledTime(urlResource, System.currentTimeMillis());
|
|
|
- } else {
|
|
|
- log.info("解析 {} 失败", url);
|
|
|
- }
|
|
|
-
|
|
|
- Thread.sleep(1_000);
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- TextFile textFile = new TextFile();
|
|
|
- private WebRequest biliWebRequest() {
|
|
|
- String cookie = textFile.readFile("/home/reghao/Downloads/cookie.txt");
|
|
|
- if (cookie.isBlank()) {
|
|
|
- return null;
|
|
|
- }
|
|
|
-
|
|
|
- String domain = ".bilibili.com";
|
|
|
- WebRequest webRequest = new DefaultWebRequest(cookie, domain);
|
|
|
- String url = "https://api.bilibili.com/x/member/web/account";
|
|
|
- WebResponse webResponse = webRequest.get(url);
|
|
|
- int statusCode = webResponse.getStatusCode();
|
|
|
- String body = webResponse.getBody();
|
|
|
- if (statusCode != 200 || body == null) {
|
|
|
- log.error("用户认证失败");
|
|
|
- return null;
|
|
|
- }
|
|
|
-
|
|
|
- JsonObject jsonObject = JsonConverter.jsonToJsonElement(body).getAsJsonObject();
|
|
|
- int code = jsonObject.get("code").getAsInt();
|
|
|
- String msg = jsonObject.get("message").getAsString();
|
|
|
- if (code != 0) {
|
|
|
- log.error("用户认证失败");
|
|
|
- return null;
|
|
|
- }
|
|
|
-
|
|
|
- JsonObject jsonObject1 = jsonObject.get("data").getAsJsonObject();
|
|
|
- long mid = jsonObject1.get("mid").getAsLong();
|
|
|
- String username = jsonObject1.get("uname").getAsString();
|
|
|
- String userId = jsonObject1.get("userid").getAsString();
|
|
|
- return webRequest;
|
|
|
- }
|
|
|
-
|
|
|
- @Autowired
|
|
|
- BiliVideoMongo biliVideoMongo;
|
|
|
- @Autowired
|
|
|
- DataProducer dataProducer;
|
|
|
- @Test
|
|
|
- public void feedTest() throws InterruptedException {
|
|
|
- int size = 10_000;
|
|
|
- List<BiliVideo> list = biliVideoMongo.findByNotFeedVideo(size);
|
|
|
- for (BiliVideo biliVideo : list) {
|
|
|
- try {
|
|
|
- dataProducer.put(biliVideo);
|
|
|
- } catch (InterruptedException e) {
|
|
|
- e.printStackTrace();
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- Thread.sleep(3600*24*7);
|
|
|
- }
|
|
|
-
|
|
|
- @Autowired
|
|
|
- UnparsedDataMongo unparsedDataMongo;
|
|
|
- @Test
|
|
|
- public void parseTest() {
|
|
|
- String site = Site.bilibili.name();
|
|
|
- String parser = "BiliCommentDataParser";
|
|
|
- List<UnparsedData> list = unparsedDataMongo.findNotParsed(site, parser, 10_000);
|
|
|
- int i = 1;
|
|
|
- for (UnparsedData unparsedData : list) {
|
|
|
- parserBiliComment(unparsedData);
|
|
|
- log.info("处理完第 {} 文档", i++);
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- List<UnparsedData> unparsedDataList = new ArrayList<>();
|
|
|
- private void parserBiliComment(UnparsedData unparsedData) {
|
|
|
- String url = unparsedData.getUrl();
|
|
|
- String body = unparsedData.getData();
|
|
|
- try {
|
|
|
- Map<String, Object> result = biliCommentDataParser.parse(url, body);
|
|
|
- unparsedData.setParsed(1);
|
|
|
- } catch (Exception e) {
|
|
|
- e.printStackTrace();
|
|
|
- unparsedData.setParsed(2);
|
|
|
- } finally {
|
|
|
- //unparsedDataMongo.update(unparsedData);
|
|
|
- //unparsedDataList.add(unparsedData);
|
|
|
- }
|
|
|
- }
|
|
|
-}
|