001/* 002 * SPDX-License-Identifier: Apache-2.0 003 * 004 * Copyright 2024-2026 The Enola <https://enola.dev> Authors 005 * 006 * Licensed under the Apache License, Version 2.0 (the "License"); 007 * you may not use this file except in compliance with the License. 008 * You may obtain a copy of the License at 009 * 010 * https://www.apache.org/licenses/LICENSE-2.0 011 * 012 * Unless required by applicable law or agreed to in writing, software 013 * distributed under the License is distributed on an "AS IS" BASIS, 014 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 015 * See the License for the specific language governing permissions and 016 * limitations under the License. 017 */ 018package dev.enola.format.tika; 019 020import com.google.common.collect.*; 021import com.google.common.io.ByteSource; 022import com.google.common.net.MediaType; 023 024import dev.enola.common.io.mediatype.MediaTypeProvider; 025 026import org.apache.tika.detect.DefaultDetector; 027import org.apache.tika.metadata.Metadata; 028import org.apache.tika.metadata.TikaCoreProperties; 029import org.apache.tika.mime.MimeTypeException; 030import org.apache.tika.mime.MimeTypes; 031import org.apache.tika.parser.AutoDetectParser; 032import org.slf4j.Logger; 033import org.slf4j.LoggerFactory; 034 035import java.io.IOException; 036import java.util.Map; 037import java.util.Set; 038 039public class TikaMediaTypeProvider implements MediaTypeProvider { 040 041 private static final Set<String> EXCLUDED = 042 ImmutableSet.of( 043 // GV conflicts with our GraphvizMediaType (which has UTF_8; Tika's does not) 044 ".gv"); 045 046 private static final Logger LOG = LoggerFactory.getLogger(TikaMediaTypeProvider.class); 047 private static final DefaultDetector tika = new DefaultDetector(); 048 private final Map<MediaType, Set<MediaType>> knownTypesWithAlternatives; 049 private final Multimap<String, MediaType> extensionsToTypes; 050 051 public TikaMediaTypeProvider() { 052 // NB: Similar code in TikaMediaTypesThingConverter 053 var tikaMimeTypes = MimeTypes.getDefaultMimeTypes(); 054 var tikaMediaTypeRegistry = new AutoDetectParser().getMediaTypeRegistry(); 055 var tikaMediaTypes = tikaMediaTypeRegistry.getTypes(); 056 var n = tikaMediaTypes.size(); 057 var knownTypesWithAlternativesBuilder = 058 ImmutableMap.<MediaType, Set<MediaType>>builderWithExpectedSize(n); 059 var extensionsToTypesBuilder = ImmutableSetMultimap.<String, MediaType>builder(); 060 for (var tikaMediaType : tikaMediaTypes) { 061 // TODO Transform tikaMediaTypeRegistry super & child types into alternatives? 062 var alt = ImmutableSet.<MediaType>of(); 063 var guavaMediaType = TikaMediaTypes.toGuava(tikaMediaType); 064 knownTypesWithAlternativesBuilder.put(guavaMediaType, alt); 065 066 var mediaTypeName = tikaMediaType.toString(); 067 try { 068 var tikaMimeType = tikaMimeTypes.getRegisteredMimeType(mediaTypeName); 069 if (tikaMimeType == null) continue; 070 for (var additionalExtension : tikaMimeType.getExtensions()) { 071 if (EXCLUDED.contains(additionalExtension)) continue; 072 // TODO This is probably not actually required? Even wrong?? 073 if (!additionalExtension.startsWith(".")) 074 additionalExtension = "." + additionalExtension; 075 extensionsToTypesBuilder.put(additionalExtension, guavaMediaType); 076 } 077 } catch (MimeTypeException e) { 078 LOG.warn("MediaType not found: {}", mediaTypeName, e); 079 } 080 } 081 knownTypesWithAlternatives = knownTypesWithAlternativesBuilder.build(); 082 extensionsToTypes = extensionsToTypesBuilder.build(); 083 } 084 085 @Override 086 public Map<MediaType, Set<MediaType>> knownTypesWithAlternatives() { 087 return knownTypesWithAlternatives; 088 } 089 090 @Override 091 public Multimap<String, MediaType> extensionsToTypes() { 092 return extensionsToTypes; 093 } 094 095 @Override 096 public MediaType detect(String uri, ByteSource byteSource, MediaType original) { 097 for (var excluded : EXCLUDED) if (uri.endsWith(excluded)) return original; 098 099 var metadata = new Metadata(); 100 metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, uri); 101 metadata.set(Metadata.CONTENT_TYPE, original.toString()); 102 103 try (var is = byteSource.openBufferedStream()) { 104 var mediaType = TikaMediaTypes.toGuava(tika.detect(is, metadata)); 105 return mediaType; 106 } catch (IOException e) { 107 LOG.debug("IOException for {},", uri, e); 108 return original; 109 } 110 } 111}