Jsoup操作HTML

Jsoup操作HTML

起男 17 2025-04-27

Jsoup操作HTML

依赖

        <!-- jsoup HTML parser library, pinned to version 1.18.3 -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.18.3</version>
        </dependency>

解析HTML

        // Sample page: one <title> in the head and two <p> elements in the body.
        String markup = """
                <html>
                    <head>
                        <title>标题a</title>
                    </head>
                    <body>
                        <p>abcdefg</p>
                        <p>1234567</p>
                    </body>
                </html>
                """;
        // Build the DOM from the in-memory string.
        Document page = Jsoup.parse(markup);

        // Print the combined text of every <title> element.
        System.out.println(page.select("title").text());

        // Print the combined text of every <p> element.
        System.out.println(page.select("p").text());

爬取网页内容

        // Open a connection to the page and fetch it as a parsed Document.
        Connection connect = Jsoup.connect("http://www.dingqinan.com/");
        Document document = connect.get();
        // To inspect the whole fetched page, print the document itself:
        // System.out.println(document);
        // Print the page title.
        System.out.println(document.title());
        // Select every anchor that carries an href attribute.
        Elements links = document.select("a[href]");
        for (Element link : links) {
            // Link text
            System.out.println(link.text());
            // Link target, exactly as written in the href attribute
            System.out.println(link.attr("href"));
        }
        // Select every image whose src contains a .png/.jpg/.jpeg/.gif extension (case-insensitive).
        Elements imgs = document.select("img[src~=(?i)\\.(png|jpe?g|gif)]");
        for (Element img : imgs) {
            // The selector matches on src, so src is always present here; data-src is not.
            // Prefer data-src when the tag has one, otherwise fall back to src so that
            // a matched image never prints an empty line.
            String imageUrl = img.hasAttr("data-src") ? img.attr("data-src") : img.attr("src");
            System.out.println(imageUrl);
        }

修改HTML元素

        // Sample page: the second paragraph carries the class "xxhh".
        String markup = """
                <html>
                    <head>
                        <title>标题a</title>
                    </head>
                    <body>
                        <p>abcdefg</p>
                        <p class="xxhh">1234567</p>
                    </body>
                </html>
                """;
        Document page = Jsoup.parse(markup);
        // Take the first <p class="xxhh"> and overwrite its text content.
        page.selectFirst("p.xxhh").text("1111");
        // Print the whole document so the modified paragraph is visible.
        System.out.println(page);

防止XSS攻击

        // Input markup: a link that carries an inline onclick handler.
        String untrusted = "<p><a href='http://www.dqn.com' onclick='getCookies()'>提交</a></p>";
        // Clean the markup against jsoup's basic safelist and print the result.
        System.out.println(Jsoup.clean(untrusted, Safelist.basic()));

输出结果中的onclick事件被清除了