001/*
002 * SPDX-License-Identifier: Apache-2.0
003 *
004 * Copyright 2024-2026 The Enola <https://enola.dev> Authors
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 *     https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package dev.enola.common.html;
019
020import dev.enola.common.io.iri.URIs;
021import dev.enola.common.io.resource.ReadableResource;
022
023import org.jsoup.Jsoup;
024import org.jsoup.nodes.Document;
025import org.jsoup.nodes.Entities;
026
027import java.io.IOException;
028import java.nio.charset.Charset;
029
030/**
031 * HTML utilities.
032 *
033 * <p>Note that HTML to Thing conversion is done by TikaThingConverter, not here.
034 */
035public final class HTML {
036
037    // TODO canonicalize JS inside <script>
038    // TODO canonicalize CSS inside <style>
039
040    public static String canonicalize(ReadableResource html, Charset outCharset, boolean format)
041            throws IOException {
042        var baseURI = URIs.getBase(html.uri());
043        var dom = read(html);
044        var outputSetting = new Document.OutputSettings();
045        outputSetting.charset(outCharset);
046        outputSetting.escapeMode(Entities.EscapeMode.xhtml);
047        outputSetting.maxPaddingWidth(-1);
048        if (format) {
049            outputSetting.outline(true);
050            outputSetting.indentAmount(2);
051        } else {
052            outputSetting.outline(false);
053            outputSetting.indentAmount(0);
054        }
055        outputSetting.prettyPrint(true);
056        dom.outputSettings(outputSetting);
057        return dom.html();
058    }
059
060    public static Document read(ReadableResource html) throws IOException {
061        var baseURI = URIs.getBase(html.uri());
062        try (var is = html.byteSource().openBufferedStream()) {
063            return Jsoup.parse(is, null, baseURI.toString());
064        }
065    }
066
067    private HTML() {}
068}