读取 MultipartFile 类型的 Word 文件,并提取文件中的具体内容

/**
 * Reads an uploaded Word file and extracts the "start time" field from its text.
 *
 * <p>The file extension selects the parser: {@code .doc} is read with {@code WordExtractor}
 * and {@code .docx} with {@code XWPFWordExtractor}. The extracted text is then searched for
 * the span between the markers "开始时间" and "结束时间", for example
 * {@code 工作开始时间: ___1999___年_9___月__9__日__0__时__0__分 结束时间},
 * which is normalized to {@code 1999-9-9 0:0}.
 *
 * @param file the uploaded file; must not be {@code null}
 * @throws IOException if the upload stream cannot be opened or read
 * @throws ParseException kept in the signature for existing callers; this implementation
 *         does not throw it itself
 * @throws IllegalArgumentException if {@code file} is {@code null}, is not a
 *         {@code .doc}/{@code .docx} file, or its text lacks the expected markers
 */
public static void parseWord(MultipartFile file) throws ParseException, IOException {
    String text = readWordText(file);
    // The method is void, so the normalized value is not handed back to the caller;
    // the call still validates that the document contains a well-formed start-time field.
    extractStartTime(text);
}

/** Extracts the plain text of a {@code .doc} or {@code .docx} upload, closing all resources. */
private static String readWordText(MultipartFile file) throws IOException {
    if (file == null) {
        throw new IllegalArgumentException("file must not be null");
    }
    String fileName = file.getOriginalFilename();
    if (fileName == null) {
        throw new IllegalArgumentException("此文件不是word文件!");
    }
    // Compare case-insensitively so that "REPORT.DOCX" is accepted as well.
    String lowerName = fileName.toLowerCase(java.util.Locale.ROOT);

    if (lowerName.endsWith(".doc")) {
        // try-with-resources closes the stream and extractor even when parsing fails.
        try (InputStream stream = file.getInputStream();
             WordExtractor extractor = new WordExtractor(stream)) {
            return extractor.getText();
        }
    }
    if (lowerName.endsWith(".docx")) {
        try (InputStream stream = file.getInputStream();
             XWPFWordExtractor extractor = new XWPFWordExtractor(new XWPFDocument(stream))) {
            return extractor.getText();
        }
    }
    throw new IllegalArgumentException("此文件不是word文件!");
}

/** Cuts the value between "开始时间" and "结束时间" and normalizes it to {@code y-M-d H:m}. */
private static String extractStartTime(String text) {
    final String startMarker = "开始时间";
    final String endMarker = "结束时间";

    int markerStart = text.indexOf(startMarker);
    if (markerStart < 0) {
        throw new IllegalArgumentException("start marker \"" + startMarker + "\" not found in document text");
    }
    int valueStart = markerStart + startMarker.length();
    // Search for the end marker only after the start marker so the range is never inverted.
    int valueEnd = text.indexOf(endMarker, valueStart);
    if (valueEnd < 0) {
        throw new IllegalArgumentException("end marker \"" + endMarker + "\" not found after the start marker");
    }

    // Drop all whitespace first; the date/time separator is re-inserted by the "日" replacement.
    String value = text.substring(valueStart, valueEnd).replaceAll("\\s+", "");
    // The label may be followed by either an ASCII or a full-width colon.
    if (value.startsWith(":") || value.startsWith(":")) {
        value = value.substring(1);
    }

    return value
            .replace("年", "-")
            .replace("月", "-")
            .replace("日", " ")
            .replace("时", ":")
            .replace("分", "")
            // Fill-in blanks in the form are drawn with underscores or em dashes.
            .replace("_", "")
            .replace("—", "");
}

思路:先将上传的 MultipartFile 类型的 Word 文件解析为纯文本字符串,再通过字符串截取(indexOf / substring)的方式,从该字符串中提取所需的内容。

经验分享 程序员 微信小程序 职场和发展