001/*
002 * SPDX-License-Identifier: Apache-2.0
003 *
004 * Copyright 2024-2026 The Enola <https://enola.dev> Authors
005 *
006 * Licensed under the Apache License, Version 2.0 (the "License");
007 * you may not use this file except in compliance with the License.
008 * You may obtain a copy of the License at
009 *
010 *     https://www.apache.org/licenses/LICENSE-2.0
011 *
012 * Unless required by applicable law or agreed to in writing, software
013 * distributed under the License is distributed on an "AS IS" BASIS,
014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
015 * See the License for the specific language governing permissions and
016 * limitations under the License.
017 */
018package dev.enola.format.tika;
019
020import com.google.common.collect.*;
021import com.google.common.io.ByteSource;
022import com.google.common.net.MediaType;
023
024import dev.enola.common.io.mediatype.MediaTypeProvider;
025
026import org.apache.tika.detect.DefaultDetector;
027import org.apache.tika.metadata.Metadata;
028import org.apache.tika.metadata.TikaCoreProperties;
029import org.apache.tika.mime.MimeTypeException;
030import org.apache.tika.mime.MimeTypes;
031import org.apache.tika.parser.AutoDetectParser;
032import org.slf4j.Logger;
033import org.slf4j.LoggerFactory;
034
035import java.io.IOException;
036import java.util.Map;
037import java.util.Set;
038
039public class TikaMediaTypeProvider implements MediaTypeProvider {
040
041    private static final Set<String> EXCLUDED =
042            ImmutableSet.of(
043                    // GV conflicts with our GraphvizMediaType (which has UTF_8; Tika's does not)
044                    ".gv");
045
046    private static final Logger LOG = LoggerFactory.getLogger(TikaMediaTypeProvider.class);
047    private static final DefaultDetector tika = new DefaultDetector();
048    private final Map<MediaType, Set<MediaType>> knownTypesWithAlternatives;
049    private final Multimap<String, MediaType> extensionsToTypes;
050
051    public TikaMediaTypeProvider() {
052        // NB: Similar code in TikaMediaTypesThingConverter
053        var tikaMimeTypes = MimeTypes.getDefaultMimeTypes();
054        var tikaMediaTypeRegistry = new AutoDetectParser().getMediaTypeRegistry();
055        var tikaMediaTypes = tikaMediaTypeRegistry.getTypes();
056        var n = tikaMediaTypes.size();
057        var knownTypesWithAlternativesBuilder =
058                ImmutableMap.<MediaType, Set<MediaType>>builderWithExpectedSize(n);
059        var extensionsToTypesBuilder = ImmutableSetMultimap.<String, MediaType>builder();
060        for (var tikaMediaType : tikaMediaTypes) {
061            // TODO Transform tikaMediaTypeRegistry super & child types into alternatives?
062            var alt = ImmutableSet.<MediaType>of();
063            var guavaMediaType = TikaMediaTypes.toGuava(tikaMediaType);
064            knownTypesWithAlternativesBuilder.put(guavaMediaType, alt);
065
066            var mediaTypeName = tikaMediaType.toString();
067            try {
068                var tikaMimeType = tikaMimeTypes.getRegisteredMimeType(mediaTypeName);
069                if (tikaMimeType == null) continue;
070                for (var additionalExtension : tikaMimeType.getExtensions()) {
071                    if (EXCLUDED.contains(additionalExtension)) continue;
072                    // TODO This is probably not actually required? Even wrong??
073                    if (!additionalExtension.startsWith("."))
074                        additionalExtension = "." + additionalExtension;
075                    extensionsToTypesBuilder.put(additionalExtension, guavaMediaType);
076                }
077            } catch (MimeTypeException e) {
078                LOG.warn("MediaType not found: {}", mediaTypeName, e);
079            }
080        }
081        knownTypesWithAlternatives = knownTypesWithAlternativesBuilder.build();
082        extensionsToTypes = extensionsToTypesBuilder.build();
083    }
084
085    @Override
086    public Map<MediaType, Set<MediaType>> knownTypesWithAlternatives() {
087        return knownTypesWithAlternatives;
088    }
089
090    @Override
091    public Multimap<String, MediaType> extensionsToTypes() {
092        return extensionsToTypes;
093    }
094
095    @Override
096    public MediaType detect(String uri, ByteSource byteSource, MediaType original) {
097        for (var excluded : EXCLUDED) if (uri.endsWith(excluded)) return original;
098
099        var metadata = new Metadata();
100        metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, uri);
101        metadata.set(Metadata.CONTENT_TYPE, original.toString());
102
103        try (var is = byteSource.openBufferedStream()) {
104            var mediaType = TikaMediaTypes.toGuava(tika.detect(is, metadata));
105            return mediaType;
106        } catch (IOException e) {
107            LOG.debug("IOException for {},", uri, e);
108            return original;
109        }
110    }
111}