Quellcode durchsuchen

1.ObjectType 中新增几种文件类型
2.调整 oss-store 中 tika 的文件类型识别

reghao vor 4 Tagen
Ursprung
Commit
bd92aa1bd4

+ 5 - 1
oss-api/src/main/java/cn/reghao/oss/api/constant/ObjectType.java

@@ -14,7 +14,11 @@ public enum ObjectType {
     Audio(1003, "Audio"),
     Text(1004, "Text"),
     Other(1005, "Application"),
-    Any(1006, "*/*");
+    Any(1006, "*/*"),
+    DocPdf(1007, "DocPdf"),
+    DocWord(1008, "DocWord"),
+    DocExcel(1009, "DocExcel"),
+    DocPpt(1010, "DocPpt");
 
     private final int code;
     // contentType 的值不能改变, 代码中会使用它作为前缀和上传文件的 content-type 进行比较

+ 9 - 1
oss-mgr/src/main/java/cn/reghao/oss/mgr/rpc/ConsoleServiceImpl.java

@@ -83,7 +83,15 @@ public class ConsoleServiceImpl implements ConsoleService {
             fileType = ObjectType.Audio.getValue();
         } else if (contentType.startsWith("text")) {
             fileType = ObjectType.Text.getValue();
-        } else if (contentType.startsWith("application")) {
+        } else if (contentType.equals("application/pdf")) {
+            fileType = ObjectType.DocPdf.getValue();
+        } else if (contentType.equals("application/vnd.openxmlformats-officedocument.wordprocessingml.document")) {
+            fileType = ObjectType.DocWord.getValue();
+        } else if (contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")) {
+            fileType = ObjectType.DocExcel.getValue();
+        } else if (contentType.equals("application/vnd.openxmlformats-officedocument.presentationml.presentation")) {
+            fileType = ObjectType.DocPpt.getValue();
+        }  else if (contentType.startsWith("application/")) {
             fileType = ObjectType.Other.getValue();
         }
 

+ 6 - 1
oss-store/pom.xml

@@ -59,7 +59,12 @@
         <dependency>
             <groupId>org.apache.tika</groupId>
             <artifactId>tika-core</artifactId>
-            <version>2.9.1</version>
+            <version>3.2.2</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.tika</groupId>
+            <artifactId>tika-parsers-standard-package</artifactId>
+            <version>3.2.2</version>
         </dependency>
 
         <dependency>

+ 4 - 7
oss-store/src/main/java/cn/reghao/oss/store/disk/HddFlushService.java

@@ -4,10 +4,8 @@ import cn.reghao.jutil.jdk.thread.ThreadFactoryBuilder;
 import cn.reghao.oss.api.dto.rest.UploadDoneResult;
 import cn.reghao.oss.api.iface.ConsoleService;
 import cn.reghao.oss.store.config.OssStoreConfig;
+import cn.reghao.oss.store.util.FileUtil;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.tika.Tika;
-import org.apache.tika.io.TikaInputStream;
-import org.apache.tika.metadata.Metadata;
 import org.springframework.stereotype.Service;
 
 import java.io.File;
@@ -24,7 +22,6 @@ import java.util.concurrent.*;
 @Slf4j
 @Service
 public class HddFlushService {
-    private static final Tika TIKA = new Tika();
     private static final int BUFFER_SIZE = 128 * 1024; // 128KB 缓冲区,利于 CPU 缓存
     // 动态计算搬运并发度
     private static final int MOVE_THREADS = Math.max(1, Runtime.getRuntime().availableProcessors() / 2);
@@ -145,13 +142,13 @@ public class HddFlushService {
     private FlushResult moveAndChecksum(String src, String dest) throws Exception {
         MessageDigest digest = MessageDigest.getInstance("SHA-256");
         Path srcPath = Paths.get(src);
-        String detectedType;
+        String detectedType = FileUtil.getContentType(src);
 
         // 1. Tika 识别:利用 Path 获取,它内部会按需打开 Channel
         // 只读文件头,速度极快,不影响后续搬运
-        try (TikaInputStream tikaStream = TikaInputStream.get(srcPath)) {
+        /*try (TikaInputStream tikaStream = TikaInputStream.get(srcPath)) {
             detectedType = TIKA.detect(tikaStream, new Metadata());
-        }
+        }*/
 
         // 2. 核心搬运:使用 FileChannel + DirectBuffer
         try (FileChannel srcChannel = FileChannel.open(srcPath, StandardOpenOption.READ);

+ 3 - 43
oss-store/src/main/java/cn/reghao/oss/store/handler/OssMultipartUploadHandler.java

@@ -6,7 +6,7 @@ import cn.reghao.oss.api.dto.rest.FastUploadResult;
 import cn.reghao.oss.api.dto.rest.UploadResult;
 import cn.reghao.oss.api.dto.rest.UploadFileRet;
 import cn.reghao.oss.api.iface.ConsoleService;
-import cn.reghao.oss.api.util.FileUtil;
+import cn.reghao.oss.store.util.FileUtil;
 import cn.reghao.oss.api.util.OssSamplingHash;
 import cn.reghao.oss.store.config.OssStoreConfig;
 import cn.reghao.oss.store.disk.DiskService;
@@ -14,7 +14,6 @@ import cn.reghao.oss.store.disk.HddFlushService;
 import cn.reghao.oss.store.util.ResponseHelper;
 import cn.reghao.oss.store.util.UploadProgressManager;
 import io.netty.buffer.ByteBuf;
-import io.netty.buffer.Unpooled;
 import io.netty.channel.ChannelHandlerContext;
 import io.netty.channel.SimpleChannelInboundHandler;
 import io.netty.handler.codec.http.*;
@@ -22,7 +21,6 @@ import io.netty.handler.codec.http.multipart.*;
 import io.netty.util.AttributeKey;
 import io.netty.util.ReferenceCountUtil;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.tika.Tika;
 
 import java.io.File;
 import java.io.IOException;
@@ -48,7 +46,6 @@ public class OssMultipartUploadHandler extends SimpleChannelInboundHandler<HttpO
     private HttpPostMultipartRequestDecoder decoder;
 
     // 当前请求的上下文状态
-    private static final Tika tika = new Tika();
     private final Map<String, String> formData = new HashMap<>();
     private final ConsoleService consoleService;
     private final HddFlushService hddFlushService;
@@ -56,9 +53,6 @@ public class OssMultipartUploadHandler extends SimpleChannelInboundHandler<HttpO
     private FileChannel singleFileChannel;
     private String singleFileId;
     private final MessageDigest messageDigest;
-    // 8 字节
-    private final ByteBuf headerCollector = Unpooled.buffer(8);
-    private boolean isTypeIdentified = false;
     private final OssStoreConfig ossStoreConfig;
 
     public OssMultipartUploadHandler(OssStoreConfig ossStoreConfig, ConsoleService consoleService,
@@ -167,26 +161,6 @@ public class OssMultipartUploadHandler extends SimpleChannelInboundHandler<HttpO
         if (chunk != null) {
             // Nginx 的 client_body_buffer_size参数会显著影响 Netty 接收到的第一个 chunk 的大小
             try {
-                if (!isTypeIdentified) {
-                    // 将新到的数据写入 collector,最多写满 8 字节
-                    int need = 8 - headerCollector.readableBytes();
-                    if (need > 0) {
-                        headerCollector.writeBytes(chunk, Math.min(need, chunk.readableBytes()));
-                        // 只要凑够了 8 字节,或者文件已经传输完毕(应对极小文件)
-                        if (headerCollector.readableBytes() == 8 || fileUpload.isCompleted()) {
-                            // TODO 将检测到的 contentType 和 channel 要求的 contentType 对比, 不符合则立即终止上传
-                            String sniffedType = tika.detect(toByteArray(headerCollector));
-                            formData.put("sniffedContentType", sniffedType);
-                            isTypeIdentified = true;
-                            // 释放 collector 内存
-                            headerCollector.clear();
-                        }
-                    }
-                    // 注意:这里需要重置 chunk 的 readerIndex,因为 writeBytes 移动了它
-                    // 或者在前面使用 slice()/duplicate() 保持 chunk 原封不动写磁盘
-                    chunk.readerIndex(0);
-                }
-
                 // 关键点:在移动指针前,先更新摘要
                 // 使用 nioBuffer() 不会移动 readerIndex,方便后续写入
                 ByteBuffer nioBuffer = chunk.nioBuffer();
@@ -260,12 +234,12 @@ public class OssMultipartUploadHandler extends SimpleChannelInboundHandler<HttpO
             // 单文件上传
             String uploadId = singleFileId;
             String filename = formData.get("filename");
-            String contentType = formData.get("sniffedContentType");
             String clientSha256sum = formData.get("clientSha256sum");
             String sha256sum = formData.get("sha256sum");
 
             String ssdTempPath = diskService.getSsdTempPath(uploadId);
             File tmpFile = new File(ssdTempPath);
+            String contentType = FileUtil.getContentType(ssdTempPath);
             long size = tmpFile.length();
             String objectName = getObjectName(uploadId, filename, contentType, channelPrefix);
             UploadResult uploadResult = new UploadResult();
@@ -306,7 +280,7 @@ public class OssMultipartUploadHandler extends SimpleChannelInboundHandler<HttpO
             String ssdTempPath = diskService.getSsdTempPath(uploadId);
             File tmpFile = new File(ssdTempPath);
             long size = tmpFile.length();
-            String contentType = getContentType(tmpFile);
+            String contentType = FileUtil.getContentType(ssdTempPath);
             String objectName = getObjectName(uploadId, filename, contentType, channelPrefix);
             String clientSha256sum = formData.get("clientSha256sum");
 
@@ -351,25 +325,11 @@ public class OssMultipartUploadHandler extends SimpleChannelInboundHandler<HttpO
         return String.format("%s%s.%s", channelPrefix, uploadId, suffix);
     }
 
-    private String getContentType(File file) {
-        try {
-            // Tika 会根据文件头字节进行深度探测
-            return tika.detect(file);
-        } catch (IOException e) {
-            log.error("Tika 探测文件类型失败: {}", file.getAbsolutePath(), e);
-            return "application/octet-stream";
-        }
-    }
-
     private void reset() {
         cleanDecoder();
         if (messageDigest != null) {
             messageDigest.reset();
         }
-
-        if (headerCollector.refCnt() > 0) {
-            headerCollector.release();
-        }
     }
 
     private void cleanDecoder() {

+ 2 - 4
oss-store/src/main/java/cn/reghao/oss/store/handler/OssUploadHandler.java

@@ -5,7 +5,7 @@ import cn.reghao.oss.api.dto.rest.FastUploadResult;
 import cn.reghao.oss.api.dto.rest.UploadResult;
 import cn.reghao.oss.api.dto.rest.UploadFileRet;
 import cn.reghao.oss.api.iface.ConsoleService;
-import cn.reghao.oss.api.util.FileUtil;
+import cn.reghao.oss.store.util.FileUtil;
 import cn.reghao.oss.store.config.OssStoreConfig;
 import cn.reghao.oss.store.config.StorageConstants;
 import cn.reghao.oss.store.disk.HddFlushService;
@@ -19,8 +19,6 @@ import io.netty.util.ReferenceCounted;
 import lombok.extern.slf4j.Slf4j;
 
 import java.nio.charset.StandardCharsets;
-import java.nio.file.Files;
-import java.nio.file.Paths;
 import java.util.UUID;
 
 /**
@@ -147,9 +145,9 @@ public class OssUploadHandler extends SimpleChannelInboundHandler<HttpObject> {
             String channelPrefix = (String) ctx.channel().attr(AttributeKey.valueOf("channelPrefix")).get();
             long uploadBy = (Long) ctx.channel().attr(AttributeKey.valueOf("uploadBy")).get();
 
-            String contentType = state.getMimeType(); // 获取识别到的类型
             String sha256sum = state.getSha256();
             String ssdTempPath = state.getTempPath();
+            String contentType = FileUtil.getContentType(ssdTempPath);
             String filename = state.getFilename();
             long size = state.getTotalSize();
             String uploadId = state.getObjectId();

+ 22 - 1
oss-api/src/main/java/cn/reghao/oss/api/util/FileUtil.java → oss-store/src/main/java/cn/reghao/oss/store/util/FileUtil.java

@@ -1,10 +1,31 @@
-package cn.reghao.oss.api.util;
+package cn.reghao.oss.store.util;
+
+import lombok.extern.slf4j.Slf4j;
+import org.apache.tika.Tika;
+import org.apache.tika.io.TikaInputStream;
+import org.apache.tika.metadata.Metadata;
+
+import java.nio.file.Path;
 
 /**
  * @author reghao
  * @date 2026-04-28 16:07:27
  */
+@Slf4j
 public class FileUtil {
+    private static final Tika tika = new Tika();
+
+    public static String getContentType(String filePath) {
+        String contentType = "application/octet-stream";
+        try (TikaInputStream tikaStream = TikaInputStream.get(Path.of(filePath))) {
+            contentType = tika.detect(tikaStream, new Metadata());
+        } catch (Exception e) {
+            log.error(e.getMessage());
+        }
+
+        return contentType;
+    }
+
     /**
      * 从文件名中提取后缀名
      *

+ 0 - 17
oss-store/src/main/java/cn/reghao/oss/store/util/UploadState.java

@@ -4,7 +4,6 @@ import io.netty.buffer.ByteBuf;
 import lombok.Getter;
 import lombok.Setter;
 import lombok.extern.slf4j.Slf4j;
-import org.apache.tika.Tika;
 
 import java.io.FileOutputStream;
 import java.io.IOException;
@@ -20,10 +19,6 @@ import java.nio.ByteBuffer;
 @Setter
 @Getter
 public class UploadState {
-    private static final Tika TIKA = new Tika();
-    private String mimeType = "application/octet-stream";
-    private boolean mimeDetected = false;
-
     private final String objectId;
     private final String tempPath;
     private final FileOutputStream fos;
@@ -40,18 +35,6 @@ public class UploadState {
     }
 
     public void appendData(ByteBuf buf) throws IOException {
-        // 1. 如果还没探测过 MIME,且当前有数据
-        if (!mimeDetected && buf.readableBytes() > 0) {
-            // Tika 只需要前 8KB 左右就能准确认出绝大多数格式
-            int length = Math.min(buf.readableBytes(), 8192);
-            byte[] header = new byte[length];
-            // 使用 getBytes 不会改变 ByteBuf 的 readerIndex,不影响后续写入
-            buf.getBytes(buf.readerIndex(), header);
-
-            this.mimeType = TIKA.detect(header);
-            this.mimeDetected = true;
-        }
-
         int length = buf.readableBytes();
         // 更新 SHA-256
         ByteBuffer nioBuffer = buf.nioBuffer();