Java实现word,pdf转html并保留格式

更新时间：2025年07月16日 10:06:45 作者：xyyf

这篇文章主要为大家详细介绍了如何使用Java将Word、PDF转换为HTML并保留格式，文中的示例代码讲解详细，感兴趣的小伙伴可以了解一下。

一、word转html

依赖：

<!-- Versions shared by the POI artifacts and the xdocreport XHTML converter below -->
<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>
 
<!-- Word to HTML -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Word to HTML -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>${xhtml.version}</version>
</dependency>
<!-- Office document / table handling, 2007+ formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Office document / table handling, 2003 formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>${poi.version}</version>
</dependency>

代码：

import fr.opensagres.poi.xwpf.converter.xhtml.Base64EmbedImgManager;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLConverter;
import fr.opensagres.poi.xwpf.converter.xhtml.XHTMLOptions;
import org.apache.commons.codec.binary.Base64;
import org.apache.poi.hwpf.HWPFDocument;
import org.apache.poi.hwpf.converter.WordToHtmlConverter;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
 
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import java.io.*;
import java.net.URL;
 
/**
 * Converts Word documents (.doc / .docx) to HTML strings, embedding pictures
 * as base64 data so the result is self-contained.
 */
public class WordUtil {

    /**
     * Opens a Word document from a URL and converts it to HTML.
     *
     * @param fileUrl    URL of the Word document
     * @param fileSuffix file extension including the leading dot, ".doc" or ".docx"
     *                   (matched case-insensitively)
     * @return the converted HTML
     * @throws IllegalArgumentException if the suffix is neither ".doc" nor ".docx"
     * @throws RuntimeException         if the conversion itself fails
     * @throws Exception                if the URL is malformed or cannot be opened
     */
    public static String wordToHtml(String fileUrl, String fileSuffix) throws Exception {
        URL url = new URL(fileUrl);
        // Constant-first comparison is null-safe and accepts any letter case (".Docx" etc.).
        boolean isDocx = ".docx".equalsIgnoreCase(fileSuffix);
        boolean isDoc = ".doc".equalsIgnoreCase(fileSuffix);
        if (!isDocx && !isDoc) {
            // Reject before opening the stream so no network I/O happens for bad input.
            throw new IllegalArgumentException("错误的文件后缀");
        }
        try (InputStream inputStream = url.openStream()) {
            return isDocx ? word2007ToHtml(inputStream) : word2003ToHtml(inputStream);
        }
    }

    /**
     * Converts a Word 2007+ (docx) stream to HTML.
     *
     * @param inputStream stream positioned at the start of a docx document; not closed here
     * @return the converted HTML
     * @throws RuntimeException if reading or converting the document fails
     */
    public static String word2007ToHtml(InputStream inputStream) {
        // Converting into a Writer keeps the text as characters end to end, so the
        // result does not depend on the platform default charset.
        try (StringWriter htmlWriter = new StringWriter();
             XWPFDocument docxDocument = new XWPFDocument(inputStream)) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep styles even if they are not used
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: output is wrapped in a <div>
            options.setFragment(true);
            // Embed images as base64
            options.setImageManager(new Base64EmbedImgManager());
            // Convert to HTML
            XHTMLConverter.getInstance().convert(docxDocument, htmlWriter, options);
            return htmlWriter.toString();
        } catch (Exception e) {
            System.out.println("Word转Html过程出现异常！");
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

    /**
     * Converts a Word 2003 (doc) stream to HTML.
     *
     * @param inputStream stream positioned at the start of a doc document; not closed here
     * @return the converted HTML
     * @throws RuntimeException if reading or converting the document fails
     */
    public static String word2003ToHtml(InputStream inputStream) throws Exception {
        try (StringWriter writer = new StringWriter();
             HWPFDocument document = new HWPFDocument(inputStream)) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Embed pictures as base64 data URIs, labelled with the picture's own MIME
            // type instead of assuming every picture is a PNG.
            wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) ->
                    "data:" + pictureType.getMime() + ";base64," + Base64.encodeBase64String(bytes));
            wordToHtmlConverter.processDocument(document);
            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
            DOMSource domSource = new DOMSource(htmlDocument);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            System.out.println("Word转Html过程出现异常！");
            // Keep the original exception as the cause so the stack trace is not lost.
            throw new RuntimeException(e.getMessage(), e);
        }
    }

}

二、pdf转html

依赖：

        <!-- NOTE(review): none of these declare a <version>; presumably they are supplied
             by a parent POM / dependencyManagement section - confirm, or add explicit versions. -->
        <dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
        </dependency>
 
        <dependency>
            <groupId>net.mabboud.fontverter</groupId>
            <artifactId>FontVerter</artifactId>
        </dependency>
        <dependency>
            <groupId>org.reflections</groupId>
            <artifactId>reflections</artifactId>
        </dependency>
        <!-- PDF to text -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
        </dependency>

代码:

import org.apache.pdfbox.pdmodel.PDDocument;
import org.fit.pdfdom.PDFDomTree;
 
import java.io.*;
import java.net.URL;
 
/**
 * Converts PDF documents to HTML strings using PDFBox and pdf2dom.
 */
public class PDFUtil {

    /**
     * Opens a PDF from a URL and converts it to HTML.
     *
     * @param fileUrl URL of the PDF document
     * @return the generated HTML
     * @throws IOException if the URL is malformed, the stream cannot be read,
     *                     or the conversion fails (the original failure is kept as the cause)
     */
    public static String pdfToHtml(String fileUrl) throws IOException {
        URL url = new URL(fileUrl);
        try (InputStream inputStream = url.openStream()) {
            return pdfToHtml(inputStream);
        } catch (Exception e) {
            // Same message as before, but keep the cause so the stack trace is not lost.
            throw new IOException(e.getMessage(), e);
        }
    }

    /**
     * Converts a PDF stream to HTML.
     *
     * <p>The HTML is rendered straight into memory. No intermediate file is written,
     * so concurrent calls cannot overwrite each other's output and nothing is left
     * behind in the working directory.
     *
     * @param inputStream stream positioned at the start of a PDF document; not closed here
     * @return the generated HTML
     * @throws IOException if the PDF cannot be loaded or converted
     */
    public static String pdfToHtml(InputStream inputStream) throws IOException {
        // try-with-resources closes the document even when the conversion throws.
        try (PDDocument document = PDDocument.load(inputStream);
             StringWriter writer = new StringWriter()) {
            new PDFDomTree().writeText(document, writer);
            return writer.toString();
        }
    }
}

三、方法补充

Java实现word转html

1.引入maven依赖

<!-- Versions shared by the POI artifacts and the xdocreport XHTML converter below -->
<properties>
    <poi.version>5.2.3</poi.version>
    <xhtml.version>2.0.4</xhtml.version>
</properties>

<!-- Word to HTML -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-scratchpad</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Word to HTML -->
<dependency>
    <groupId>fr.opensagres.xdocreport</groupId>
    <artifactId>fr.opensagres.poi.xwpf.converter.xhtml</artifactId>
    <version>${xhtml.version}</version>
</dependency>
<!-- Office document / table handling, 2007+ formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi-ooxml</artifactId>
    <version>${poi.version}</version>
</dependency>
<!-- Office document / table handling, 2003 formats -->
<dependency>
    <groupId>org.apache.poi</groupId>
    <artifactId>poi</artifactId>
    <version>${poi.version}</version>
</dependency>

2.Java代码

    /**
     * Converts a Word 2007+ (docx) file to HTML.
     *
     * @param filePath path of the docx file
     * @return the HTML as a string, or {@code null} if the conversion fails (the error is logged)
     */
    public static String docxToHtml(String filePath) {
        // The file stream is its own resource so it is always closed; the original
        // never closed it explicitly. Converting into a Writer keeps the text as
        // characters, independent of the platform default charset.
        try (java.io.InputStream fileStream = Files.newInputStream(Paths.get(filePath));
             XWPFDocument docxDocument = new XWPFDocument(fileStream);
             StringWriter htmlWriter = new StringWriter()) {
            XHTMLOptions options = XHTMLOptions.create();
            // Keep styles even if they are not used
            options.setIgnoreStylesIfUnused(false);
            // Fragment mode: output is wrapped in a <div>
            options.setFragment(true);
            // Embed images as base64
            options.setImageManager(new Base64EmbedImgManager());
            // Convert to HTML
            XHTMLConverter.getInstance().convert(docxDocument, htmlWriter, options);
            return htmlWriter.toString();
        } catch (Exception e) {
            log.error("Word转Html过程出现异常！", e);
        }
        return null;
    }


    /**
     * Converts a Word 2003 (doc) file to HTML.
     *
     * @param filePath path of the doc file
     * @return the HTML as a string, or {@code null} if the conversion fails (the error is logged)
     */
    public static String docToHtml(String filePath) {
        // The file stream is its own resource so it is always closed; the original
        // never closed it explicitly.
        try (java.io.InputStream fileStream = Files.newInputStream(Paths.get(filePath));
             HWPFDocument document = new HWPFDocument(fileStream);
             StringWriter writer = new StringWriter()) {
            WordToHtmlConverter wordToHtmlConverter = new WordToHtmlConverter(
                    DocumentBuilderFactory.newInstance().newDocumentBuilder().newDocument());
            // Embed pictures as base64 data URIs, labelled with the picture's own MIME
            // type instead of assuming every picture is a PNG.
            wordToHtmlConverter.setPicturesManager((bytes, pictureType, s, v, v1) ->
                    "data:" + pictureType.getMime() + ";base64," + Base64.encodeBase64String(bytes));
            wordToHtmlConverter.processDocument(document);
            org.w3c.dom.Document htmlDocument = wordToHtmlConverter.getDocument();
            DOMSource domSource = new DOMSource(htmlDocument);
            TransformerFactory factory = TransformerFactory.newInstance();
            Transformer serializer = factory.newTransformer();
            serializer.setOutputProperty(OutputKeys.ENCODING, "utf-8");
            serializer.setOutputProperty(OutputKeys.INDENT, "yes");
            serializer.setOutputProperty(OutputKeys.METHOD, "html");
            serializer.transform(domSource, new StreamResult(writer));
            return writer.toString();
        } catch (Exception e) {
            log.error("Word转Html过程出现异常！", e);
        }
        return null;
    }

    /**
     * Converts a Word file to HTML, choosing the converter from the file extension.
     *
     * @param filePath local path of the Word file
     * @return the HTML as a string; {@code null} if the path is null, the extension is
     *         neither "doc" nor "docx" (case-insensitive), or the conversion fails
     */
    public static String autoWord2Html(String filePath) {
        String suffix = "";
        if (filePath != null) {
            int dotIndex = filePath.lastIndexOf('.');
            // Only text after a real dot counts as an extension; without this check a
            // dot-less path named "doc" or "docx" would be taken for a Word file.
            if (dotIndex >= 0) {
                suffix = filePath.substring(dotIndex + 1);
            }
        }
        if ("doc".equalsIgnoreCase(suffix)) {
            return docToHtml(filePath);
        } else if ("docx".equalsIgnoreCase(suffix)) {
            return docxToHtml(filePath);
        } else {
            log.info("文件格式错误，只支持Docx和Doc格式的文档！");
            return null;
        }
    }

使用Java实现PDF到HTML的转换

引入以下依赖

<dependency>
            <groupId>net.sf.cssbox</groupId>
            <artifactId>pdf2dom</artifactId>
            <version>2.0.3</version>
        </dependency>
 
        <dependency>
            <groupId>net.mabboud.fontverter</groupId>
            <artifactId>FontVerter</artifactId>
            <version>1.2.22</version> <!-- use the latest version as needed -->
        </dependency>
        <dependency>
            <groupId>org.reflections</groupId>
            <artifactId>reflections</artifactId>
            <version>0.10.2</version> <!-- use the latest version as needed -->
        </dependency>
        <!-- PDF to text -->
        <dependency>
            <groupId>org.apache.pdfbox</groupId>
            <artifactId>pdfbox</artifactId>
            <version>2.0.24</version>
        </dependency>

实现关键代码

        File file = new File(pdfUrl);
        String localPdfFilePath = 要解析的PDF文件路径（本地）+ file.getName();
        String newPdfFilePath = 截取PDF后生成的PDF文件路径+ file.getName();
        String outFilePath = 生成的HTML文件.html";
        String pdfContent = "";
        PDDocument pdfDocument = PDDocument.load(new File(localPdfFilePath));
        // 检查文档中是否有页面
        if (pdfDocument.getNumberOfPages() > 0) {
            // 移除第一页
            pdfDocument.removePage(0);
        }
        // 保存更改后的PDF到新文件
        pdfDocument.save(new File(newPdfFilePath));
        System.out.println("第一页已被移除，新PDF保存在: " + newPdfFilePath);
        pdfDocument.close();
        // 转换成html格式文件
        PDDocument document = PDDocument.load(new File(newPdfFilePath));
        Writer writer = new PrintWriter(outFilePath, "UTF-8");
        new PDFDomTree().writeText(document, writer);
        writer.close();
        document.close();
        // 获取html内容
        try (BufferedReader reader = new BufferedReader(new FileReader(outFilePath))) {
            StringBuilder htmlContent = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                htmlContent.append(line).append("\n"); // 追加每一行内容，并添加换行符
            }
            pdfContent = String.valueOf(htmlContent);
        } catch (IOException e) {
            e.printStackTrace();
            System.err.println("读取 HTML 文件时出错。");
        }

到此，这篇关于Java实现word、pdf转html并保留格式的文章就介绍到这了。更多相关Java word、pdf转html的内容，请搜索脚本之家以前的文章或继续浏览下面的相关文章，希望大家以后多多支持脚本之家！

您可能感兴趣的文章:

Netty分布式Future与Promise执行回调相关逻辑剖析
这篇文章主要为大家介绍了Netty分布式Future与Promise执行回调相关逻辑剖析，有需要的朋友可以借鉴参考下，希望能够有所帮助，祝大家多多进步
2022-03-03
Java使用POI-TL和JFreeChart动态生成Word报告
本文介绍了使用POI-TL和JFreeChart生成包含动态数据和图表的Word报告的方法,并分享了实际开发中的踩坑经验,通过代码示例讲解的非常详细,具有一定的参考价值,需要的朋友可以参考下
2025-02-02
SpringBoot实现异步调用的方法示例
本文介绍了在Java的SpringBoot中实现异步请求和异步调用的几种方法,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友们下面随着小编来一起学习学习吧
2025-01-01
Java 关于时间复杂度和空间复杂度的深度刨析
算法复杂度分为时间复杂度和空间复杂度。其作用：时间复杂度是度量算法执行的时间长短；而空间复杂度是度量算法所需存储空间的大小
2021-11-11
Java图形界面之JFrame,JLabel,JButton详解
这篇文章主要介绍了Java图形界面之JFrame、JLabel、JButton详解,文中有非常详细的代码示例,对正在学习java的小伙伴们有非常好的帮助,需要的朋友可以参考下
2021-04-04
Java垃圾回收算法及GC触发条件解读
Java垃圾回收机制自动管理内存,减轻开发负担但存在性能开销与延迟问题,主流算法包括标记清除、复制、分代回收等,通过分代策略提升效率,触发条件涉及新生代与老年代,优化需调参、对象复用及减少临时对象,未来趋势为低延迟、大内存支持及智能化GC技术
2025-07-07
java 中Buffer源码的分析
这篇文章主要介绍了java 中Buffer源码的分析的相关资料,需要的朋友可以参考下
2017-06-06
Java中的for循环高级用法
本文系统解析Java中传统、增强型for循环、Stream API及并行流的实现原理与性能差异,并通过大量代码示例展示实际开发中的最佳实践,感兴趣的朋友一起看看吧
2025-06-06
Java SSL证书错误：No subject alternative name
本文深入解析Java SSL证书错误'No subject alternative names present'的解决方案,文中通过示例代码介绍的非常详细,对大家的学习或者工作具有一定的参考学习价值,需要的朋友们下面随着小编来一起学习学习吧
2026-03-03
Java线程中的关键字和方法示例详解
这篇文章主要介绍了Java有关线程中的关键字和方法,本文通过示例代码给大家介绍的非常详细，对大家的学习或工作具有一定的参考借鉴价值，需要的朋友可以参考下
2022-03-03