001/*
002 * SPDX-License-Identifier: Apache-2.0
003 *
004 * Copyright 2024-2026 The Enola <https://enola.dev> Authors
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 *     https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package dev.enola.common.xml;
019
020import dev.enola.common.io.resource.ReadableResource;
021import dev.enola.common.io.resource.WritableResource;
022
023import org.w3c.dom.Document;
024import org.w3c.dom.Node;
025import org.w3c.dom.NodeList;
026import org.xml.sax.InputSource;
027import org.xml.sax.SAXException;
028
029import java.io.IOException;
030import java.io.InputStream;
031import java.io.StringWriter;
032
033import javax.xml.XMLConstants;
034import javax.xml.parsers.DocumentBuilder;
035import javax.xml.parsers.DocumentBuilderFactory;
036import javax.xml.parsers.ParserConfigurationException;
037import javax.xml.transform.*;
038import javax.xml.transform.dom.DOMSource;
039import javax.xml.transform.stream.StreamResult;
040
041public final class XML {
042
043    // TODO Should also order attributes of all elements alphabetically
044
045    // TODO Could #later re-implement this with StAX or SAX instead of DOM, for less memory use? But
046    // DOM is just easier to manipulate in code - and StAX is a PITA lacking a built-in indenting
047    // formatter.
048
049    public static void canonicalize(ReadableResource in, WritableResource out, boolean format)
050            throws IOException {
051        try (var inputStream = in.byteSource().openBufferedStream()) {
052            out.charSink().write(normalizeXML(inputStream, format));
053        } catch (ParserConfigurationException | SAXException | TransformerException e) {
054            throw new IOException("XML Error: " + in, e);
055        }
056    }
057
058    private static String normalizeXML(InputStream inputStream, boolean format)
059            throws ParserConfigurationException, IOException, SAXException, TransformerException {
060
061        // TODO Use streaming SAX instead of DOM; and break this up... use XmlResourceParser
062
063        DocumentBuilderFactory factory = DocumentBuilderFactory.newInstance();
064        factory.setFeature(XMLConstants.FEATURE_SECURE_PROCESSING, true); // #security
065        DocumentBuilder builder = factory.newDocumentBuilder();
066
067        Document document = builder.parse(new InputSource(inputStream));
068
069        normalizeWhitespace(document.getDocumentElement());
070
071        TransformerFactory transformerFactory = TransformerFactory.newInstance();
072        Transformer transformer = transformerFactory.newTransformer();
073
074        transformer.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
075        if (format) transformer.setOutputProperty(OutputKeys.INDENT, "yes");
076
077        StringWriter writer = new StringWriter();
078        transformer.transform(new DOMSource(document), new StreamResult(writer));
079        return writer.toString();
080    }
081
082    private static void normalizeWhitespace(Node node) {
083        if (node.getNodeType() == Node.TEXT_NODE) {
084            node.setTextContent(node.getTextContent().trim());
085        }
086
087        NodeList childNodes = node.getChildNodes();
088        for (int i = 0; i < childNodes.getLength(); i++) {
089            normalizeWhitespace(childNodes.item(i));
090        }
091    }
092
093    private XML() {}
094}