001/* 002 * SPDX-License-Identifier: Apache-2.0 003 * 004 * Copyright 2024-2026 The Enola <https://enola.dev> Authors 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package dev.enola.common.xml; 019 020import dev.enola.common.io.resource.ReadableResource; 021import dev.enola.common.io.resource.WritableResource; 022 023import org.w3c.dom.Document; 024import org.w3c.dom.Node; 025import org.w3c.dom.NodeList; 026import org.xml.sax.InputSource; 027import org.xml.sax.SAXException; 028 029import java.io.IOException; 030import java.io.InputStream; 031import java.io.StringWriter; 032 033import javax.xml.XMLConstants; 034import javax.xml.parsers.DocumentBuilder; 035import javax.xml.parsers.DocumentBuilderFactory; 036import javax.xml.parsers.ParserConfigurationException; 037import javax.xml.transform.*; 038import javax.xml.transform.dom.DOMSource; 039import javax.xml.transform.stream.StreamResult; 040 041public final class XML { 042 043 // TODO Should also order attributes of all elements alphabetically 044 045 // TODO Could #later re-implement this with StAX or SAX instead of DOM, for less memory use? But 046 // DOM is just easier to manipulate in code - and StAX is a PITA lacking a built-in indenting 047 // formatter. 048 049 public static void canonicalize(ReadableResource in, WritableResource out, boolean format) 050 throws IOException { 051 try (var inputStream = in.byteSource().openBufferedStream()) { 052 out.charSink().write(normalizeXML(inputStream, format)); 053 } catch (ParserConfigurationException | SAXException | TransformerException e) { 054 throw new IOException("XML Error: " + in, e); 055 } 056 } 057 058 private static String normalizeXML(InputStream inputStream, boolean format) 059 throws ParserConfigurationException, IOException, SAXException, TransformerException { 060 061 // TODO Use streaming SAX instead of DOM; and break this up... use XmlResourceParser 062 063 DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance(); 064 factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); // #security 065 DocumentBuilder builder = factory.newDocumentBuilder(); 066 067 Document document = builder.parse(new InputSource(inputStream)); 068 069 normalizeWhitespace(document.getDocumentElement()); 070 071 TransformerFactory transformerFactory = TransformerFactory.newInstance(); 072 Transformer transformer = transformerFactory.newTransformer(); 073 074 transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8"); 075 if (format) transformer.setOutputProperty(OutputKeys.INDENT, "yes"); 076 077 StringWriter writer = new StringWriter(); 078 transformer.transform(new DOMSource(document), new StreamResult(writer)); 079 return writer.toString(); 080 } 081 082 private static void normalizeWhitespace(Node node) { 083 if (node.getNodeType() == Node.TEXT_NODE) { 084 node.setTextContent(node.getTextContent().trim()); 085 } 086 087 NodeList childNodes = node.getChildNodes(); 088 for (int i = 0; i < childNodes.getLength(); i++) { 089 normalizeWhitespace(childNodes.item(i)); 090 } 091 } 092 093 private XML() {} 094}