reghao 2 лет назад
Родитель
Сommit
63d2da3437

+ 0 - 46
jdk/src/main/java/cn/reghao/jutil/jdk/http/DlResponse.java

@@ -1,46 +0,0 @@
-package cn.reghao.jutil.jdk.http;
-
-import java.io.ByteArrayOutputStream;
-
-/**
- * @author reghao
- * @date 2021-03-21 03:32:11
- */
-public class DlResponse {
-    private final int statusCode;
-    // byte
-    private final long contentLength;
-    // ms
-    private final long costTime;
-    private final String contentRange;
-    private final ByteArrayOutputStream result;
-
-    public DlResponse(int statusCode, long contentLength, long costTime,
-                      String contentRange, ByteArrayOutputStream result) {
-        this.statusCode = statusCode;
-        this.contentLength = contentLength;
-        this.costTime = costTime;
-        this.contentRange = contentRange;
-        this.result = result;
-    }
-
-    public int getStatusCode() {
-        return statusCode;
-    }
-
-    public long getContentLength() {
-        return contentLength;
-    }
-
-    public long getCostTime() {
-        return costTime;
-    }
-
-    public String getContentRange() {
-        return contentRange;
-    }
-
-    public ByteArrayOutputStream getResult() {
-        return result;
-    }
-}

+ 0 - 11
jdk/src/main/java/cn/reghao/jutil/jdk/http/HttpDownloader.java

@@ -1,11 +0,0 @@
-package cn.reghao.jutil.jdk.http;
-
-/**
- * @author reghao
- * @date 2021-07-31 22:38:08
- */
-public interface HttpDownloader {
-    int head(String url);
-    DlResponse download(String url);
-    boolean download(String url, String dir);
-}

+ 0 - 24
jdk/src/main/java/cn/reghao/jutil/jdk/http/ResStatus.java

@@ -1,24 +0,0 @@
-package cn.reghao.jutil.jdk.http;
-
-/**
- * HTTP 资源状态
- *
- * @author reghao
- * @date 2021-11-19 14:50:18
- */
-public enum ResStatus {
-    avail(200), notFound(404), notAvail(600);
-
-    private final int value;
-    ResStatus(int value) {
-        this.value = value;
-    }
-
-    public String getName() {
-        return this.name();
-    }
-
-    public Integer getValue() {
-        return value;
-    }
-}

+ 0 - 28
jdk/src/main/java/cn/reghao/jutil/jdk/http/proxy/ProxyType.java

@@ -1,28 +0,0 @@
-package cn.reghao.jutil.jdk.http.proxy;
-
-/**
- * HTTP 请求代理类型
- *
- * @author reghao
- * @date 2019-11-01 12:49:28
- */
-public enum ProxyType {
-    // HTTP 代理
-    HTTP("http"),
-    // HTTPS 代理
-    HTTPS("https"),
-    // SOCKS4 代理
-    SOCKS4("socks4"),
-    // SOCKS5 代理
-    SOCKS5("socks5");
-
-    private String value;
-
-    ProxyType(String value) {
-        this.value = value;
-    }
-
-    public String getValue() {
-        return value;
-    }
-}

+ 0 - 29
jdk/src/main/java/cn/reghao/jutil/jdk/http/proxy/RequestProxy.java

@@ -1,29 +0,0 @@
-package cn.reghao.jutil.jdk.http.proxy;
-
-/**
- * HTTP 请求代理
- *
- * @author reghao
- * @date 2019-12-17 13:21:38
- */
-public class RequestProxy {
-    private ProxyType type;
-    private String host;
-    private int port;
-    private String usrename;
-    private String password;
-
-    public RequestProxy(String host, int port) {
-        this.host = host;
-        this.port = port;
-        this.type = ProxyType.SOCKS5;
-    }
-
-    public String getHost() {
-        return host;
-    }
-
-    public int getPort() {
-        return port;
-    }
-}

+ 17 - 0
jdk/src/main/java/cn/reghao/jutil/jdk/text/TextFile.java

@@ -111,6 +111,23 @@ public class TextFile {
         return content.substring(0, index);
     }
 
+    /**
+     * Reads a file line by line and returns its text, with lines joined by
+     * {@link System#lineSeparator()} and no trailing separator.
+     * <p>
+     * An {@link IOException} is reported via {@code printStackTrace()} and whatever was
+     * read up to that point is returned; an empty or unreadable file yields {@code ""}.
+     *
+     * @param file the file to read
+     * @return the file content without a trailing line separator, or {@code ""} if nothing was read
+     */
+    public String readFile(File file) {
+        StringBuilder content = new StringBuilder();
+        // try-with-resources closes the reader even when readLine() throws
+        try (BufferedReader in = new BufferedReader(new InputStreamReader(new FileInputStream(file)), bufSize)) {
+            String line;
+            while ((line = in.readLine()) != null) {
+                content.append(line).append(System.lineSeparator());
+            }
+        } catch (IOException ioe) {
+            ioe.printStackTrace();
+        }
+
+        // Strip the separator appended after the last line. When nothing was read there is
+        // no separator (index == -1) and substring(0, -1) would throw, so return "" instead.
+        int index = content.lastIndexOf(System.lineSeparator());
+        return index < 0 ? "" : content.substring(0, index);
+    }
+
     /**
      * 向文件写入内容
      *

+ 39 - 33
tool/src/main/java/cn/reghao/jutil/tool/http/BaseWebRequest.java

@@ -1,6 +1,6 @@
 package cn.reghao.jutil.tool.http;
 
-import cn.reghao.jutil.jdk.http.proxy.RequestProxy;
+import cn.reghao.jutil.jdk.text.TextFile;
 import cn.reghao.jutil.tool.http.util.FakeDnsResolver;
 import cn.reghao.jutil.tool.http.util.MyConnectionSocketFactory;
 import cn.reghao.jutil.tool.http.util.MySSLConnectionSocketFactory;
@@ -18,12 +18,16 @@ import org.apache.http.impl.client.CloseableHttpClient;
 import org.apache.http.impl.client.HttpClientBuilder;
 import org.apache.http.impl.client.HttpClients;
 import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
+import org.apache.http.impl.cookie.BasicClientCookie;
 import org.apache.http.protocol.HttpContext;
 import org.apache.http.ssl.SSLContexts;
 
+import java.io.File;
 import java.net.InetSocketAddress;
 import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Date;
 import java.util.List;
 
 /**
@@ -32,29 +36,28 @@ import java.util.List;
  */
 public class BaseWebRequest {
     protected final CloseableHttpClient client;
-    @Deprecated
-    protected final Charset charset;
     protected final String bodyCharset;
-    protected final String userAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko";
+    protected HttpContext context;
 
     public BaseWebRequest() {
         HttpClientBuilder builder = HttpClients.custom()
-                .setConnectionManager(connectionConfig())
+                .setConnectionManager(connectionConfig(false))
                 .setDefaultRequestConfig(requestConfig());
         //.setKeepAliveStrategy(keepAliveConfig())
         this.client = builder.build();
-        this.charset = StandardCharsets.UTF_8;
-        this.bodyCharset = "utf8";
+        this.bodyCharset = StandardCharsets.UTF_8.name();
+        this.context = HttpClientContext.create();
     }
 
-    public BaseWebRequest(String charsetName, boolean enableProxy) {
+    public BaseWebRequest(File cookieFile, String domain) {
         HttpClientBuilder builder = HttpClients.custom()
-                .setConnectionManager(connectionConfig(enableProxy))
+                .setConnectionManager(connectionConfig(false))
                 .setDefaultRequestConfig(requestConfig());
         //.setKeepAliveStrategy(keepAliveConfig())
         this.client = builder.build();
-        this.charset = StandardCharsets.UTF_8;
-        this.bodyCharset = charsetName;
+        this.bodyCharset = StandardCharsets.UTF_8.name();
+        this.context = HttpClientContext.create();
+        setCookies(cookieFile, domain);
     }
 
     /**
@@ -83,20 +86,6 @@ public class BaseWebRequest {
         return cm;
     }
 
-    /**
-     * 连接池配置
-     *
-     * @param
-     * @return
-     * @date 2021-03-23 下午6:21
-     */
-    private PoolingHttpClientConnectionManager connectionConfig() {
-        PoolingHttpClientConnectionManager cm = new PoolingHttpClientConnectionManager();
-        cm.setMaxTotal(50);
-        cm.setDefaultMaxPerRoute(20);
-        return cm;
-    }
-
     /**
      * 请求配置
      *
@@ -120,25 +109,42 @@ public class BaseWebRequest {
      * @return
      * @date 2021-03-24 上午2:04
      */
-    protected HttpContext httpContext(List<Cookie> cookies, RequestProxy proxy) {
+    protected HttpContext httpContext() {
         HttpContext context = HttpClientContext.create();
-        setCookies(context, cookies);
-        if (proxy != null) {
-            setProxy(context, proxy);
-        }
+        String host = "";
+        int port = 8888;
+        setProxy(context, host, port);
         return context;
     }
 
-    private void setCookies(HttpContext context, List<Cookie> cookies) {
+    /**
+     * Reads {@code name=value} pairs separated by {@code ;} from a file and stores them
+     * as cookies in this request's {@code context}.
+     * <p>
+     * Every cookie is given the supplied domain, path {@code /}, and an expiry date
+     * 180 days after the moment this method runs. Segments that are not a
+     * {@code name=value} pair (empty segments, or ones with no name) are skipped.
+     *
+     * @param cookieFile file containing the cookie text
+     * @param domain     domain assigned to every cookie
+     */
+    private void setCookies(File cookieFile, String domain) {
+        TextFile textFile = new TextFile();
+        String cookieText = textFile.readFile(cookieFile);
+        // replaceAll, not replace: "\\s+" has to be interpreted as a regex so that spaces
+        // and line breaks around the pairs are actually removed
+        String[] pairs = cookieText.replaceAll("\\s+", "").split(";");
+        // Expiry timestamp (now + 180 days in ms) is the same for all cookies
+        long ms = (long)3600*24*180*1000 + System.currentTimeMillis();
+        List<Cookie> cookies = new ArrayList<>();
+        for (String pair : pairs) {
+            // Split on the first '=' only; a value may itself contain '='
+            int eq = pair.indexOf('=');
+            if (eq <= 0) {
+                // Empty segment (e.g. from ";;") or missing name: nothing to store
+                continue;
+            }
+            String name = pair.substring(0, eq);
+            String value = pair.substring(eq + 1);
+            BasicClientCookie cookie = new BasicClientCookie(name, value);
+            cookie.setAttribute("domain", domain);
+            cookie.setDomain(domain);
+            cookie.setPath("/");
+            cookie.setExpiryDate(new Date(ms));
+            cookies.add(cookie);
+        }
+
         // Collect the parsed cookies into a cookie store
         CookieStore cookieStore = new BasicCookieStore();
         cookies.forEach(cookieStore::addCookie);
+
         // Register the cookie store on the request context
         context.setAttribute(HttpClientContext.COOKIE_STORE, cookieStore);
     }
 
-    private void setProxy(HttpContext context, RequestProxy proxy) {
-        InetSocketAddress socketAddress = new InetSocketAddress(proxy.getHost(), proxy.getPort());
+    private void setProxy(HttpContext context, String host, int port) {
+        InetSocketAddress socketAddress = new InetSocketAddress(host, port);
         // 设置 SOCKS5 代理
         context.setAttribute("socks.address", socketAddress);
         // TODO 设置 HTTP/HTTPS 代理

+ 0 - 212
tool/src/main/java/cn/reghao/jutil/tool/http/DefaultHttpDownloader.java

@@ -1,212 +0,0 @@
-package cn.reghao.jutil.tool.http;
-
-import cn.reghao.jutil.jdk.http.DlResponse;
-import cn.reghao.jutil.jdk.http.HttpDownloader;
-import cn.reghao.jutil.jdk.http.util.UrlFormatter;
-import cn.reghao.jutil.jdk.http.util.UserAgents;
-import cn.reghao.jutil.jdk.http.proxy.RequestProxy;
-import org.apache.http.Header;
-import org.apache.http.HttpEntity;
-import org.apache.http.HttpResponse;
-import org.apache.http.client.methods.CloseableHttpResponse;
-import org.apache.http.client.methods.HttpGet;
-import org.apache.http.client.methods.HttpHead;
-import org.apache.http.protocol.HttpContext;
-
-import java.io.ByteArrayOutputStream;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-/**
- * @author reghao
- * @date 2019-11-29 10:03:18
- */
-public class DefaultHttpDownloader extends BaseWebRequest implements HttpDownloader {
-    public DefaultHttpDownloader() {
-    }
-
-    public DefaultHttpDownloader(boolean enableProxy) {
-        super("utf8", enableProxy);
-    }
-
-    @Override
-    public int head(String url) {
-        HttpHead head = new HttpHead(url);
-        try (CloseableHttpResponse response = client.execute(head)) {
-            return response.getStatusLine().getStatusCode();
-        } catch (IOException e) {
-            System.out.format("%s head 请求失败 -> %s%n", url, e.getMessage());
-        }
-
-        // 资源无法访问
-        return 600;
-    }
-
-    @Override
-    public DlResponse download(String url) {
-        HttpGet get = new HttpGet(url);
-        get.setHeader("User-Agent", UserAgents.getDesktopAgent());
-        long start = System.currentTimeMillis();
-        try (CloseableHttpResponse response = client.execute(get)) {
-            int statusCode = response.getStatusLine().getStatusCode();
-            if (statusCode == 200) {
-                return dlResponse(response, statusCode, start);
-            } else if (statusCode == 206) {
-                return dlResponse(response, statusCode, start);
-            } else if (statusCode == 302) {
-                // 请求重定向
-                String location = response.getFirstHeader("Location").getValue();
-            } else if (statusCode == 404) {
-                System.out.format("%s 资源不存在", url);
-                return new DlResponse(statusCode, 0, 0, null, null);
-            }
-        } catch (IOException e) {
-            System.out.format("%s 下载失败 -> %s%n", url, e.getMessage());
-        }
-        return null;
-    }
-
-    private DlResponse dlResponse(HttpResponse response, int statusCode, long start) throws IOException {
-        Header header = response.getFirstHeader("Content-Length");
-        long contentLength;
-        if (header == null) {
-            contentLength = 0;
-        } else {
-            contentLength = Long.parseLong(header.getValue());
-        }
-
-        Header contentRangeHeader = response.getFirstHeader("Content-Range");
-        String contentRange = null;
-        if (contentRangeHeader != null) {
-            contentRange = contentRangeHeader.getValue();
-        }
-
-        HttpEntity entity = response.getEntity();
-        int avail = entity.getContent().available();
-        ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
-        entity.writeTo(byteArrayOutputStream);
-        long costTime = System.currentTimeMillis()-start;
-        return new DlResponse(statusCode, contentLength, costTime, contentRange, byteArrayOutputStream);
-    }
-
-    @Override
-    public boolean download(String url, String dir) {
-        HttpGet get = new HttpGet(url);
-        get.setHeader("User-Agent", UserAgents.getDesktopAgent());
-        long start = System.currentTimeMillis();
-        try (CloseableHttpResponse response = client.execute(get)) {
-            int statusCode = response.getStatusLine().getStatusCode();
-            if (statusCode == 200) {
-                HttpEntity httpEntity = response.getEntity();
-                String contentType = httpEntity.getContentType().getValue();
-                String filename = UrlFormatter.getFilename(url);
-                File file = new File(dir + File.separator + filename);
-                FileOutputStream fout = new FileOutputStream(file);
-                // 持续写到本地文件,直到服务器没有数据
-                httpEntity.writeTo(fout);
-                return true;
-            }
-        } catch (IOException e) {
-            System.out.format("%s 下载失败 -> %s%n", url, e.getMessage());
-        }
-        return false;
-    }
-
-    public boolean download(String url, RequestProxy requestProxy, String dir) {
-        HttpGet get = new HttpGet(url);
-        get.setHeader("User-Agent", UserAgents.getDesktopAgent());
-        HttpContext context = httpContext(Collections.emptyList(), requestProxy);
-        long start = System.currentTimeMillis();
-        try (CloseableHttpResponse response = client.execute(get, context)) {
-            int statusCode = response.getStatusLine().getStatusCode();
-            if (statusCode == 200) {
-                HttpEntity httpEntity = response.getEntity();
-                String contentType = httpEntity.getContentType().getValue();
-                String filename = UrlFormatter.getFilename(url);
-                File file = new File(dir + File.separator + filename);
-                FileOutputStream fout = new FileOutputStream(file);
-                // 持续写到本地文件,直到服务器没有数据
-                httpEntity.writeTo(fout);
-                return true;
-            }
-        } catch (IOException e) {
-            System.out.format("%s 下载失败 -> %s%n", url, e.getMessage());
-        }
-        return false;
-    }
-
-    /**
-     * 将字节数组流保存到文件
-     *
-     * @param
-     * @return
-     * @date 2021-03-15 下午10:13
-     */
-    private void saveFile(ByteArrayOutputStream byteArrayOutputStream, File file) throws IOException {
-        FileOutputStream fout = new FileOutputStream(file);
-        fout.write(byteArrayOutputStream.toByteArray());
-        fout.flush();
-        fout.close();
-    }
-
-    public long acceptRanges(String url) throws IOException {
-        HttpHead httpHead = new HttpHead(url);
-        HttpResponse response = client.execute(httpHead);
-        int statusCode = response.getStatusLine().getStatusCode();
-        if (statusCode == 200) {
-            if (response.getFirstHeader("Accept-Ranges") != null) {
-                Header contentLengthHeader = response.getFirstHeader("Content-Length");
-                return Long.parseLong(contentLengthHeader.getValue());
-            }
-        }
-
-        return 0;
-    }
-
-    public List<String> splitContent(long contentLength) {
-        // 100KiB
-        long maxFragment = 1024*100;
-        // 10MiB
-        //long maxFragment = 1024*1024*10;
-        List<String> list = new ArrayList<>();
-        if (contentLength < maxFragment) {
-            list.add("bytes=0-" + (contentLength - 1));
-        } else {
-            long i = 0;
-            for (;i + maxFragment < contentLength; i += maxFragment) {
-                list.add("bytes=" + i + "-" + (i+maxFragment-1));
-            }
-            list.add("bytes=" + i + "-" + (contentLength-1));
-        }
-        return list;
-    }
-
-    public DlResponse download(HttpGet httpGet) {
-        httpGet.setHeader("User-Agent", UserAgents.getDesktopAgent());
-        HttpContext context = httpContext(new ArrayList<>(), null);
-
-        long start = System.currentTimeMillis();
-        try (CloseableHttpResponse response = client.execute(httpGet, context)) {
-            int statusCode = response.getStatusLine().getStatusCode();
-            if (statusCode == 200) {
-                return dlResponse(response, statusCode, start);
-            } else if (statusCode == 206) {
-                return dlResponse(response, statusCode, start);
-            } else if (statusCode == 302) {
-                //
-                String location = response.getFirstHeader("Location").getValue();
-            } else if (statusCode == 404) {
-                System.out.format("资源下载失败 -> %s%n", httpGet.getURI());
-                return new DlResponse(statusCode, 0, 0, null, null);
-            }
-        } catch (Exception e) {
-            System.out.format("资源下载失败 -> %s%n", httpGet.getURI());
-            e.printStackTrace();
-        }
-        return null;
-    }
-}

+ 6 - 3
tool/src/main/java/cn/reghao/jutil/tool/http/DefaultWebRequest.java

@@ -35,6 +35,10 @@ public class DefaultWebRequest extends BaseWebRequest implements WebRequest {
         super();
     }
 
+    public DefaultWebRequest(File cookieFile, String domain) {
+        super(cookieFile, domain);
+    }
+
     public DefaultWebRequest(Map<String, String> headers) {
         super();
         this.headers = headers;
@@ -178,11 +182,10 @@ public class DefaultWebRequest extends BaseWebRequest implements WebRequest {
             headers.forEach(request::addHeader);
         }
         request.setHeader("User-Agent", UserAgents.getDesktopAgent());
-        try (CloseableHttpResponse response = client.execute(request)) {
+        try (CloseableHttpResponse response = client.execute(request, context)) {
             StatusLine statusLine = response.getStatusLine();
             int statusCode = statusLine.getStatusCode();
-            //String body = EntityUtils.toString(response.getEntity(), charset);
-            String body = EntityUtils.toString(response.getEntity(), charset);
+            String body = EntityUtils.toString(response.getEntity(), bodyCharset);
             return new WebResponse(statusCode, body);
         } catch (Exception e) {
             // TODO 是否应该放在 finally 块中?

+ 0 - 99
tool/src/main/java/cn/reghao/jutil/tool/http/JdkCrawlRequest.java

@@ -1,99 +0,0 @@
-package cn.reghao.jutil.tool.http;
-
-import cn.reghao.jutil.jdk.http.WebResponse;
-import cn.reghao.jutil.jdk.http.util.UrlFormatter;
-import cn.reghao.jutil.jdk.http.util.UserAgents;
-import org.apache.commons.io.FileUtils;
-
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.net.URI;
-import java.net.http.HttpClient;
-import java.net.http.HttpRequest;
-import java.net.http.HttpResponse;
-import java.text.MessageFormat;
-import java.time.Duration;
-import java.util.logging.Logger;
-
-/**
- * 爬虫请求
- *
- * @author reghao
- * @date 2022-02-28 15:27:55
- */
-public class JdkCrawlRequest {
-    private static final Logger log = Logger.getLogger(JdkCrawlRequest.class.getName());
-
-    private final HttpClient client = HttpClient.newBuilder()
-            .version(HttpClient.Version.HTTP_1_1)
-            .build();
-    
-    public int head(String url) {
-        HttpRequest.Builder builder = HttpRequest.newBuilder()
-                .uri(URI.create(url))
-                .timeout(Duration.ofSeconds(5))
-                .GET();
-        builder.setHeader("User-Agent", UserAgents.getDesktopAgent());
-
-        try {
-            HttpResponse<String> response = client.send(builder.build(), HttpResponse.BodyHandlers.ofString());
-            return response.statusCode();
-        } catch (Exception e) {
-            log.info(MessageFormat.format("{0} 请求失败 -> {1}", url, e.getMessage()));
-            return 600;
-        }
-    }
-
-    public WebResponse get(String url) {
-        HttpRequest.Builder builder = HttpRequest.newBuilder()
-                .uri(URI.create(url))
-                .timeout(Duration.ofSeconds(5))
-                .GET();
-        builder.setHeader("User-Agent", UserAgents.getDesktopAgent());
-
-        try {
-            HttpResponse<String> response = client.send(builder.build(), HttpResponse.BodyHandlers.ofString());
-            int statusCode = response.statusCode();
-            String body = response.body();
-            return new WebResponse(statusCode, body);
-        } catch (Exception e) {
-            log.info(MessageFormat.format("{0} 请求失败 -> {1}", url, e.getMessage()));
-            return new WebResponse(600, e.getMessage());
-        }
-    }
-
-    public void download(String url, String dir) throws IOException, InterruptedException {
-        HttpRequest.Builder builder = HttpRequest.newBuilder()
-                .uri(URI.create(url))
-                .timeout(Duration.ofSeconds(30))
-                .GET();
-        builder.setHeader("User-Agent", UserAgents.getDesktopAgent());
-        try {
-            HttpResponse<InputStream> in = client.send(builder.build(), HttpResponse.BodyHandlers.ofInputStream());
-            String filename = UrlFormatter.getFilename(url);
-            File file = new File(dir + File.separator + filename);
-            saveFile(in.body(), file);
-        } catch (Exception e) {
-            throw e;
-        }
-    }
-
-    private void saveFile(InputStream in, File file) throws IOException {
-        File parentDir = file.getParentFile();
-        if (!parentDir.exists()) {
-            FileUtils.forceMkdir(parentDir);
-        }
-
-        FileOutputStream fos = new FileOutputStream(file);
-        // 1MiB
-        int len = 1024*1024;
-        byte[] buf = new byte[len];
-        int readLen;
-        while ((readLen = in.read(buf, 0, len)) != -1) {
-            fos.write(buf, 0, readLen);
-        }
-        fos.close();
-    }
-}