/*
 * SPDX-License-Identifier: Apache-2.0
 *
 * Copyright 2024-2026 The Enola <https://enola.dev> Authors
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     https://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package dev.enola.common.html;

import dev.enola.common.io.iri.URIs;
import dev.enola.common.io.resource.ReadableResource;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Entities;

import java.io.IOException;
import java.nio.charset.Charset;

/**
 * HTML utilities.
 *
 * <p>Note that HTML to Thing conversion is done by TikaThingConverter, not here.
 */
public final class HTML {

    // TODO canonicalize JS inside <script>
    // TODO canonicalize CSS inside <style>

    /**
     * Parses the given HTML resource and re-serializes it with a fixed set of jsoup output
     * settings, so that the same input always produces the same textual form.
     *
     * @param html the resource to read the HTML from
     * @param outCharset the charset configured on the jsoup output settings
     * @param format {@code true} to emit in outline mode with an indent of 2; {@code false} to
     *     disable outline mode and use an indent of 0
     * @return the HTML of the parsed document, serialized with the settings described above
     * @throws IOException if the resource cannot be opened or read
     */
    public static String canonicalize(ReadableResource html, Charset outCharset, boolean format)
            throws IOException {
        // read() derives the base URI from the resource itself, so it is not computed here.
        var dom = read(html);

        var outputSetting = new Document.OutputSettings();
        outputSetting.charset(outCharset);
        outputSetting.escapeMode(Entities.EscapeMode.xhtml);
        // NOTE(review): per the jsoup API docs, -1 means "no maximum padding width".
        outputSetting.maxPaddingWidth(-1);
        outputSetting.outline(format);
        outputSetting.indentAmount(format ? 2 : 0);
        // Pretty printing stays enabled in both modes; 'format' only controls outline & indent.
        outputSetting.prettyPrint(true);

        dom.outputSettings(outputSetting);
        return dom.html();
    }

    /**
     * Parses the given HTML resource into a jsoup {@link Document}.
     *
     * <p>The base URI handed to jsoup is {@code URIs.getBase(html.uri())}. The stream opened on
     * the resource is closed before this method returns.
     *
     * @param html the resource to read the HTML from
     * @return the parsed document
     * @throws IOException if the resource cannot be opened or read
     */
    public static Document read(ReadableResource html) throws IOException {
        var baseURI = URIs.getBase(html.uri());
        try (var is = html.byteSource().openBufferedStream()) {
            // A null charset name leaves the input encoding for jsoup to determine.
            return Jsoup.parse(is, null, baseURI.toString());
        }
    }

    private HTML() {}
}