Procházet zdrojové kódy

content/app/data 中添加一个 news 接口提供新闻服务, 它会定时爬取指定新闻网站上的内容

reghao před 7 měsíci
rodič
revize
b65dcdd340

+ 44 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/controller/NewsPostController.java

@@ -0,0 +1,44 @@
+package cn.reghao.tnb.content.app.data.controller;
+
+import cn.reghao.jutil.jdk.db.PageList;
+import cn.reghao.jutil.web.WebResult;
+import cn.reghao.tnb.content.app.data.model.vo.NewsPost;
+import cn.reghao.tnb.content.app.data.service.NewsPostService;
+import io.swagger.v3.oas.annotations.Operation;
+import io.swagger.v3.oas.annotations.tags.Tag;
+import org.springframework.web.bind.annotation.*;
+
+/**
+ * REST endpoints for news posts under {@code /api/content/post/news}:
+ * a publish stub, a paged listing and a single-item lookup.
+ *
+ * @author reghao
+ * @date 2025-07-29 14:10:28
+ */
+@Tag(name = "新闻稿件接口")
+@RestController
+@RequestMapping("/api/content/post/news")
+public class NewsPostController {
+    private final NewsPostService newsPostService;
+
+    public NewsPostController(NewsPostService newsPostService) {
+        this.newsPostService = newsPostService;
+    }
+
+    /**
+     * Publish endpoint; currently performs no work and only reports success.
+     *
+     * @return the serialized success result
+     */
+    @PostMapping("/publish")
+    @Operation(summary = "发布新闻稿件", description = "N")
+    public String publishVideoPost() {
+        return WebResult.success();
+    }
+
+    /**
+     * Returns one page of news posts.
+     *
+     * @param pn page number taken from the {@code pn} query parameter
+     * @return the serialized success result wrapping the page
+     */
+    @GetMapping("/list")
+    @Operation(summary = "获取新闻稿件列表", description = "N")
+    public String getNewsPosts(@RequestParam("pn") Integer pn) {
+        return WebResult.success(newsPostService.getByPage(pn));
+    }
+
+    /**
+     * Returns a single news post.
+     *
+     * @param newsId content id taken from the request path
+     * @return the serialized success result wrapping the post
+     */
+    @GetMapping("/detail/{newsId}")
+    @Operation(summary = "获取新闻稿件", description = "N")
+    public String getNewsPost(@PathVariable("newsId") Long newsId) {
+        return WebResult.success(newsPostService.getById(newsId));
+    }
+}

+ 122 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/db/mongo/RichTextMongo.java

@@ -0,0 +1,122 @@
+package cn.reghao.tnb.content.app.data.db.mongo;
+
+import cn.reghao.jutil.jdk.db.BaseCrud;
+import cn.reghao.jutil.jdk.db.BaseQuery;
+import cn.reghao.tnb.content.app.data.model.po.RichText;
+import com.mongodb.MongoBulkWriteException;
+import com.mongodb.client.model.InsertManyOptions;
+import com.mongodb.client.result.DeleteResult;
+import com.mongodb.client.result.InsertManyResult;
+import lombok.extern.slf4j.Slf4j;
+import org.bson.Document;
+import org.springframework.data.domain.Sort;
+import org.springframework.data.mongodb.core.MongoTemplate;
+import org.springframework.data.mongodb.core.convert.MongoConverter;
+import org.springframework.data.mongodb.core.query.Criteria;
+import org.springframework.data.mongodb.core.query.Query;
+import org.springframework.stereotype.Repository;
+
+import java.time.LocalDateTime;
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * @author reghao
+ * @date 2023-02-01 14:19:02
+ */
+@Slf4j
+@Repository
+public class RichTextMongo implements BaseCrud<RichText>, BaseQuery<RichText> {
+    private final String colName = "RichText";
+    private final MongoTemplate mongoTemplate;
+    private final MongoConverter mongoConverter;
+    private final int pageSize = 1000;
+
+    public RichTextMongo(MongoTemplate mongoTemplate, MongoConverter mongoConverter) {
+        this.mongoTemplate = mongoTemplate;
+        this.mongoConverter = mongoConverter;
+    }
+
+    @Override
+    public RichText save(RichText richText) {
+        Document doc = new Document();
+        mongoConverter.write(richText, doc);
+        mongoTemplate.getCollection(colName).insertOne(doc);
+        return null;
+    }
+
+    @Override
+    public void saveAll(List<RichText> list) {
+        List<Document> documents = list.stream()
+                .map(t -> {
+                    Document doc = new Document();
+                    mongoConverter.write(t, doc);
+                    return doc;
+                })
+                .collect(Collectors.toList());
+
+        InsertManyOptions options = new InsertManyOptions();
+        // 忽略 insert 失败的文档
+        options.ordered(false);
+        try {
+            InsertManyResult result = mongoTemplate.getCollection(colName).insertMany(documents, options);
+        }  catch (MongoBulkWriteException ignore) {
+        }
+    }
+
+    @Override
+    public void update(RichText richText) {
+        richText.setUpdateTime(LocalDateTime.now());
+
+        Document document = new Document();
+        mongoConverter.write(richText, document);
+        Document filter = new Document();
+        filter.put("contentId", richText.getContentId());
+        mongoTemplate.getCollection(colName).replaceOne(filter, document);
+    }
+
+    @Override
+    public void delete(RichText richText) {
+        Query query = new Query();
+        query.addCriteria(Criteria.where("contentId").is(richText.getContentId()));
+        DeleteResult deleteResult = mongoTemplate.remove(query, colName);
+        if (deleteResult.getDeletedCount() != 1) {
+        }
+    }
+
+    public long countByUserId(long userId) {
+        Document filter = new Document();
+        filter.put("userId", userId);
+        return mongoTemplate.getCollection(colName).countDocuments(filter);
+    }
+
+    public List<RichText> findAll(long page) {
+        Query query = new Query();
+        query.addCriteria(Criteria.where("content").ne(null));
+        query.skip((page - 1) * pageSize).limit(pageSize);
+        query.with(Sort.by(new Sort.Order(Sort.Direction.DESC, "publishAt")));
+        return mongoTemplate.find(query, RichText.class, colName);
+    }
+
+    public List<RichText> findByContentIsNull() {
+        Query query = new Query();
+        query.addCriteria(Criteria.where("content").is(null));
+        query.skip(0).limit(1000);
+        query.with(Sort.by(new Sort.Order(Sort.Direction.DESC, "publishAt")));
+        return mongoTemplate.find(query, RichText.class, colName);
+    }
+
+    public RichText findByOriginalUrl(String originalUrl) {
+        Query query = new Query();
+        query.addCriteria(Criteria.where("originalUrl").is(originalUrl));
+        List<RichText> list = mongoTemplate.find(query, RichText.class, colName);
+        return list.isEmpty() ? null : list.get(0);
+    }
+
+    public RichText findByContentId(long contentId) {
+        Query query = new Query();
+        query.addCriteria(Criteria.where("contentId").is(contentId));
+        List<RichText> list = mongoTemplate.find(query, RichText.class, colName);
+        return list.isEmpty() ? null : list.get(0);
+    }
+}

+ 1 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/model/po/ArticlePost.java

@@ -15,6 +15,7 @@ import lombok.Setter;
 @NoArgsConstructor
 @Setter
 @Getter
+@Deprecated
 public class ArticlePost extends BaseObject<String> {
     private String articleId;
     private String title;

+ 28 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/model/po/RichText.java

@@ -0,0 +1,28 @@
+package cn.reghao.tnb.content.app.data.model.po;
+
+import cn.reghao.jutil.jdk.db.BaseObject;
+import lombok.AllArgsConstructor;
+import lombok.Getter;
+import lombok.NoArgsConstructor;
+import lombok.Setter;
+
+/**
+ * Persistent rich-text content record (for example a crawled news article).
+ *
+ * @author reghao
+ * @date 2025-07-29 15:52:55
+ */
+@Getter
+@Setter
+@NoArgsConstructor
+@AllArgsConstructor
+public class RichText extends BaseObject<String> {
+    // Application-level identifier used for lookups, updates and deletes.
+    private Long contentId;
+    // Format of the content field; the news crawler stores "html".
+    private String contentType;
+    private String title;
+    private String coverUrl;
+    private String excerpt;
+    // Body of the article; null until the body has been fetched.
+    private String content;
+    private Integer scope;
+    // URL of the page this record was taken from.
+    private String originalUrl;
+    // Publish timestamp as a long; presumably epoch milliseconds — TODO confirm.
+    private Long publishAt;
+    private Long publishBy;
+}

+ 30 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/model/vo/NewsPost.java

@@ -0,0 +1,30 @@
+package cn.reghao.tnb.content.app.data.model.vo;
+
+import cn.reghao.jutil.jdk.converter.DateTimeConverter;
+import cn.reghao.tnb.content.app.data.model.po.RichText;
+import lombok.Getter;
+import lombok.Setter;
+
+/**
+ * View object exposing the news-related fields of a {@link RichText} record.
+ *
+ * @author reghao
+ * @date 2025-07-29 13:55:57
+ */
+@Getter
+@Setter
+public class NewsPost {
+    private long contentId;
+    private String title;
+    private String coverUrl;
+    private String content;
+    // Publish time rendered as text by DateTimeConverter.format.
+    private String publishAt;
+    private String originalUrl;
+
+    /**
+     * Copies the fields of the stored record into this view object.
+     *
+     * @param richText the stored record to expose; must not be null
+     */
+    public NewsPost(RichText richText) {
+        contentId = richText.getContentId();
+        title = richText.getTitle();
+        coverUrl = richText.getCoverUrl();
+        content = richText.getContent();
+        publishAt = DateTimeConverter.format(richText.getPublishAt());
+        originalUrl = richText.getOriginalUrl();
+    }
+}

+ 43 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/service/NewsPostService.java

@@ -0,0 +1,43 @@
+package cn.reghao.tnb.content.app.data.service;
+
+import cn.reghao.jutil.jdk.db.PageList;
+import cn.reghao.jutil.tool.id.SnowFlake;
+import cn.reghao.tnb.content.app.data.db.mongo.RichTextMongo;
+import cn.reghao.tnb.content.app.data.model.vo.NewsPost;
+import cn.reghao.tnb.content.app.data.model.po.RichText;
+import org.springframework.stereotype.Service;
+
+import java.util.List;
+import java.util.stream.Collectors;
+
+/**
+ * Read-side service that exposes stored {@link RichText} records as {@link NewsPost} views.
+ *
+ * @author reghao
+ * @date 2025-07-29 14:10:11
+ */
+@Service
+public class NewsPostService {
+    private final RichTextMongo richTextMongo;
+    private final SnowFlake idGenerator;
+
+    public NewsPostService(RichTextMongo richTextMongo) {
+        this.richTextMongo = richTextMongo;
+        this.idGenerator = new SnowFlake(1, 1);
+    }
+
+    /** Placeholder for adding a news post; not implemented yet. */
+    public void add() {
+    }
+
+    /**
+     * Returns one page of news posts.
+     *
+     * <p>NOTE(review): the reported page size (100) differs from the page size the
+     * repository query applies, and {@code total} is the size of the current page
+     * rather than the number of stored posts — confirm against the PageList contract.
+     *
+     * @param pn page number
+     * @return the page of posts
+     */
+    public PageList<NewsPost> getByPage(int pn) {
+        int ps = 100;
+        List<NewsPost> list = richTextMongo.findAll(pn).stream()
+                .map(NewsPost::new)
+                .collect(Collectors.toList());
+        int total = list.size();
+        return PageList.pageList(pn, ps, total, list);
+    }
+
+    /**
+     * Looks up a single news post.
+     *
+     * @param newsId content id of the post
+     * @return the post, or {@code null} when no record has that id
+     */
+    public NewsPost getById(long newsId) {
+        RichText richText = richTextMongo.findByContentId(newsId);
+        // The repository returns null for an unknown id; do not hand that to the
+        // NewsPost constructor, which dereferences its argument.
+        return richText != null ? new NewsPost(richText) : null;
+    }
+}

+ 39 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/spider/TaskContext.java

@@ -0,0 +1,39 @@
+package cn.reghao.tnb.content.app.data.spider;
+
+import cn.reghao.jutil.jdk.http.WebRequest;
+import cn.reghao.jutil.jdk.thread.ThreadPoolWrapper;
+import cn.reghao.jutil.tool.http.DefaultWebRequest;
+import cn.reghao.tnb.content.app.data.db.mongo.RichTextMongo;
+import cn.reghao.tnb.content.app.data.spider.task.GetNewsDetailTask;
+import cn.reghao.tnb.content.app.data.spider.task.GetNewsIndexTask;
+import org.springframework.stereotype.Component;
+
+import javax.annotation.PostConstruct;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.ScheduledFuture;
+import java.util.concurrent.TimeUnit;
+
+/**
+ * Owns the scheduler that periodically runs the news crawler tasks.
+ *
+ * @author reghao
+ * @date 2025-07-30 15:48:04
+ */
+@Component
+public class TaskContext {
+    private final ScheduledExecutorService scheduler = ThreadPoolWrapper.scheduledThreadPool("get-news", 5);
+    private final WebRequest webRequest = new DefaultWebRequest();
+    private final RichTextMongo richTextMongo;
+
+    public TaskContext(RichTextMongo richTextMongo) {
+        this.richTextMongo = richTextMongo;
+    }
+
+    /**
+     * Schedules both crawler tasks once the bean has been constructed:
+     * the detail task every hour and the index task every {@code pastHours} hours,
+     * each starting immediately.
+     */
+    @PostConstruct
+    public void startTask() {
+        GetNewsDetailTask getNewsDetailTask = new GetNewsDetailTask(webRequest, richTextMongo);
+        scheduler.scheduleAtFixedRate(getNewsDetailTask, 0, 1, TimeUnit.HOURS);
+
+        int pastHours = 4;
+        GetNewsIndexTask getNewsIndexTask = new GetNewsIndexTask(webRequest, richTextMongo, pastHours);
+        scheduler.scheduleAtFixedRate(getNewsIndexTask, 0, pastHours, TimeUnit.HOURS);
+    }
+
+    /**
+     * Stops the scheduler when the bean is destroyed so its worker threads
+     * do not outlive the application context.
+     */
+    @javax.annotation.PreDestroy
+    public void stopTask() {
+        scheduler.shutdownNow();
+    }
+}

+ 59 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/spider/task/GetNewsDetailTask.java

@@ -0,0 +1,59 @@
+package cn.reghao.tnb.content.app.data.spider.task;
+
+import cn.reghao.jutil.jdk.http.WebRequest;
+import cn.reghao.jutil.jdk.http.WebResponse;
+import cn.reghao.tnb.content.app.data.db.mongo.RichTextMongo;
+import cn.reghao.tnb.content.app.data.model.po.RichText;
+import lombok.extern.slf4j.Slf4j;
+import org.jsoup.Jsoup;
+import org.jsoup.nodes.Document;
+import org.jsoup.select.Elements;
+
+import java.util.List;
+
+/**
+ * @author reghao
+ * @date 2025-07-30 15:44:54
+ */
+@Slf4j
+public class GetNewsDetailTask implements Runnable {
+    private final WebRequest webRequest;
+    private final RichTextMongo richTextMongo;
+
+    public GetNewsDetailTask(WebRequest webRequest, RichTextMongo richTextMongo) {
+        this.webRequest = webRequest;
+        this.richTextMongo = richTextMongo;
+    }
+
+    @Override
+    public void run() {
+        List<RichText> richTextList = richTextMongo.findByContentIsNull();
+        for (RichText richText : richTextList) {
+            try {
+                String pageUrl = richText.getOriginalUrl();
+                WebResponse webResponse = webRequest.get(pageUrl);
+                if (webResponse.getStatusCode() != 200) {
+                    log.error("get {} failed", pageUrl);
+                    continue;
+                }
+
+                String body = webResponse.getBody();
+                String html = parsePaperNewsDetail(body);
+                if (html != null) {
+                    richText.setContentType("html");
+                    richText.setContent(html);
+                    richTextMongo.update(richText);
+                }
+                Thread.sleep(1_000);
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        }
+    }
+
+    private String parsePaperNewsDetail(String body) {
+        Document document = Jsoup.parse(body);
+        Elements elements = document.select(".index_wrapper__L_zqV");
+        return elements.size() > 0 ? elements.html() : null;
+    }
+}

+ 121 - 0
content/content-service/src/main/java/cn/reghao/tnb/content/app/data/spider/task/GetNewsIndexTask.java

@@ -0,0 +1,121 @@
+package cn.reghao.tnb.content.app.data.spider.task;
+
+import cn.reghao.jutil.jdk.converter.DateTimeConverter;
+import cn.reghao.jutil.jdk.http.WebRequest;
+import cn.reghao.jutil.jdk.http.WebResponse;
+import cn.reghao.jutil.jdk.serializer.JsonConverter;
+import cn.reghao.jutil.tool.id.SnowFlake;
+import cn.reghao.tnb.content.app.data.db.mongo.RichTextMongo;
+import cn.reghao.tnb.content.app.data.model.po.RichText;
+import com.google.gson.JsonArray;
+import com.google.gson.JsonElement;
+import com.google.gson.JsonObject;
+import lombok.extern.slf4j.Slf4j;
+
+import java.time.Duration;
+import java.time.LocalDate;
+import java.time.LocalDateTime;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * @author reghao
+ * @date 2025-05-06 11:33:42
+ */
+@Slf4j
+public class GetNewsIndexTask implements Runnable {
+    private final SnowFlake idGenerator = new SnowFlake(1, 1);
+    private final WebRequest webRequest;
+    private final RichTextMongo richTextMongo;
+    private final int pastHours;
+
+    public GetNewsIndexTask(WebRequest webRequest, RichTextMongo richTextMongo, int pastHours) {
+        this.webRequest = webRequest;
+        this.richTextMongo = richTextMongo;
+        this.pastHours = pastHours;
+    }
+
+    @Override
+    public void run() {
+        LocalDateTime current = LocalDate.now().atStartOfDay();
+        List<Integer> excludeContIds = List.of(30769556,30772115,30772061,30770621,30772060,30771769,30771960,30772059,30772058,30770148,30769570,30767725,30772111,30772110,30767342,30772330);
+        List<Integer> listRecommendIds = List.of(30769556,30772330,30771960,30771769);
+
+        String url1 = "https://api.thepaper.cn/contentapi/nodeCont/getByChannelId";
+        Map<String, Object> map = new HashMap<>();
+        map.put("channelId", "25950");
+        map.put("excludeContIds", excludeContIds);
+        map.put("listRecommendIds", listRecommendIds);
+        map.put("pageSize", "20");
+
+        int pageNum = 1;
+        long startTime = System.currentTimeMillis();
+        while (pageNum < 50) {
+            try {
+                map.put("pageNum", String.valueOf(pageNum++));
+                map.put("startTime", String.valueOf(startTime));
+
+                WebResponse webResponse = webRequest.postJson(url1, JsonConverter.objectToJson(map));
+                if (webResponse.getStatusCode() != 200) {
+                    log.error("get {} failed", url1);
+                    return;
+                }
+
+                String body = webResponse.getBody();
+                JsonObject jsonObject = JsonConverter.jsonToJsonElement(body).getAsJsonObject();
+                int code = jsonObject.get("code").getAsInt();
+                if (code != 200) {
+                    log.error("get {} failed", url1);
+                    return;
+                }
+
+                JsonObject dataObject = jsonObject.get("data").getAsJsonObject();
+                startTime = dataObject.get("startTime").getAsLong();
+                JsonArray jsonArray = dataObject.get("list").getAsJsonArray();
+                List<RichText> richTextList = new ArrayList<>();
+                for (JsonElement jsonElement : jsonArray) {
+                    JsonObject jsonObject1 = jsonElement.getAsJsonObject();
+                    String name = jsonObject1.get("name").getAsString();
+                    String pic = jsonObject1.get("pic").getAsString();
+                    long contId = jsonObject1.get("contId").getAsLong();
+                    long pubTimeLong = jsonObject1.get("pubTimeLong").getAsLong();
+                    String pageUrl = String.format("https://www.thepaper.cn/newsDetail_forward_%s", contId);
+                    RichText richText = richTextMongo.findByOriginalUrl(pageUrl);
+                    if (richText == null) {
+                        long contentId = idGenerator.nextId();
+                        richText = new RichText();
+                        richText.setContentId(contentId);
+                        richText.setOriginalUrl(pageUrl);
+                        richText.setTitle(name);
+                        richText.setCoverUrl(pic);
+                        richText.setPublishAt(pubTimeLong);
+                        richTextList.add(richText);
+                    }
+                }
+
+                if (!richTextList.isEmpty()) {
+                    richTextMongo.saveAll(richTextList);
+
+                    richTextList.sort((o1, o2) -> {
+                        long l = o1.getPublishAt() - o2.getPublishAt();
+                        return (int) l;
+                    });
+
+                    long latestPubTime = richTextList.get(richTextList.size()-1).getPublishAt();
+                    LocalDateTime pubDateTime = DateTimeConverter.localDateTime(latestPubTime);
+                    Duration duration = Duration.between(current, pubDateTime);
+                    if (duration.toHours()+pastHours < 0) {
+                        break;
+                    }
+                }
+                Thread.sleep(1_000);
+            } catch (Exception e) {
+                e.printStackTrace();
+            }
+        }
+
+        log.info("{} done", this.getClass().getSimpleName());
+    }
+}