001/*
002 * SPDX-License-Identifier: Apache-2.0
003 *
004 * Copyright 2024-2026 The Enola <https://enola.dev> Authors
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 *     https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package dev.enola.model.enola.mediatype;
019
020import com.google.common.base.Strings;
021
022import dev.enola.common.convert.ConversionException;
023import dev.enola.common.io.mediatype.MediaTypes;
024import dev.enola.format.tika.TikaMediaTypes;
025import dev.enola.thing.io.UriIntoThingConverter;
026import dev.enola.thing.repo.ThingRepositoryStore;
027
028import org.apache.tika.mime.MimeTypeException;
029import org.apache.tika.mime.MimeTypes;
030import org.apache.tika.parser.AutoDetectParser;
031import org.slf4j.Logger;
032import org.slf4j.LoggerFactory;
033
034import java.io.IOException;
035import java.net.URI;
036
037/**
038 * Converts Tika Media Types into Enola Things (AKA RDF / TTL).
039 *
040 * <p>Tika reads these from its (builtin) <a
041 * href="https://github.com/apache/tika/blob/116edb30dc5fd26770216ccffcf873f4952a5c2a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml">MIME
042 * Types XML</a> file which corresponds to the <a
043 * href="https://freedesktop.org/wiki/Specifications/shared-mime-info-spec/">freedesktop.org MIME
044 * Info Spec</a>.
045 */
046public class TikaMediaTypesThingConverter implements UriIntoThingConverter {
047
048    // TODO Make https://enola.dev/fileExtensions be links, not text?
049
050    private static final Logger LOG = LoggerFactory.getLogger(TikaMediaTypesThingConverter.class);
051
052    public static final URI IRI = URI.create("enola:TikaMediaTypes");
053
054    @Override
055    public boolean convertInto(URI from, ThingRepositoryStore into)
056            throws ConversionException, IOException {
057        if (!IRI.equals(from)) return false;
058
059        // NB: Similar code in TikaMediaTypeProvider
060        var tikaMimeTypes = MimeTypes.getDefaultMimeTypes();
061        var tikaMediaTypeRegistry = new AutoDetectParser().getMediaTypeRegistry();
062        var tikaMediaTypes = tikaMediaTypeRegistry.getTypes();
063        for (var tikaMediaType : tikaMediaTypes) {
064            var mediaTypeName = tikaMediaType.toString();
065            try {
066                var tikaMimeType = tikaMimeTypes.getRegisteredMimeType(mediaTypeName);
067                var iri = toIRI(tikaMediaType);
068                MediaType.Builder thing =
069                        into.getBuilder(iri, MediaType.class, MediaType.Builder.class);
070                thing.addType("https://enola.dev/MediaType");
071                thing.mediaType(tikaMimeType.getName());
072
073                var label = tikaMimeType.getAcronym();
074                if (!Strings.isNullOrEmpty(label)) thing.label(label);
075                else thing.label(tikaMimeType.getName());
076
077                thing.comment(tikaMimeType.getDescription());
078                thing.addAllFileExtensions(tikaMimeType.getExtensions());
079                thing.addAllSeeAlso(
080                        tikaMimeType.getLinks().stream().map(uri -> uri.toString()).toList());
081
082                // TODO var uniformTypeIdentifier = tikaMimeType.getUniformTypeIdentifier();
083                // TODO var hasMagic = tikaMimeType.hasMagic();
084
085                // TODO Tika hard-codes :( a few special cases, and doesn't e.g. do +json...
086                var superType = tikaMediaTypeRegistry.getSupertype(tikaMediaType);
087                if (superType != null) thing.parentIRI(toIRI(superType));
088
089                // tikaMediaType.getBaseType() is a superset of getSupertype()
090
091                // TODO Making the following a 1 liner...
092                // TODO Remove this once children are automagically set by generic Inference!!
093                // TODO Uncomment, once GraphvizGenerator more nicely coalesces parent & children
094                /*
095                var children = tikaMediaTypeRegistry.getChildTypes(tikaMediaType);
096                if (!children.isEmpty()) {
097                    var childrenIRI = ImmutableSet.<Link>builderWithExpectedSize(children.size());
098                    for (var child : children) {
099                        childrenIRI.add(new Link(toIRI(child)));
100                    }
101                    thing.childrenIRI(childrenIRI.build());
102                }
103                */
104
105                into.store(thing.build());
106
107            } catch (MimeTypeException e) {
108                LOG.warn("MediaType not found: {}", mediaTypeName, e);
109            }
110        }
111        return true;
112    }
113
114    private String toIRI(org.apache.tika.mime.MediaType tikaMediaType) {
115        return MediaTypes.toIRI(TikaMediaTypes.toGuava(tikaMediaType));
116    }
117}