source: josm/trunk/src/com/drew/metadata/iptc/Iso2022Converter.java@ 13500

Last change on this file since 13500 was 13061, checked in by Don-vip, 6 years ago

fix #15505 - update to metadata-extractor 2.10.1

File size: 3.7 KB
Line 
1/*
2 * Copyright 2002-2017 Drew Noakes
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 *
16 * More information about this project is available at:
17 *
18 * https://drewnoakes.com/code/exif/
19 * https://github.com/drewnoakes/metadata-extractor
20 */
21package com.drew.metadata.iptc;
22
23import com.drew.lang.annotations.NotNull;
24import com.drew.lang.annotations.Nullable;
25
26import java.nio.ByteBuffer;
27import java.nio.charset.CharacterCodingException;
28import java.nio.charset.Charset;
29import java.nio.charset.CharsetDecoder;
30
31public final class Iso2022Converter
32{
33 private static final String ISO_8859_1 = "ISO-8859-1";
34 private static final String UTF_8 = "UTF-8";
35
36 private static final byte LATIN_CAPITAL_A = 0x41;
37 private static final int DOT = 0xe280a2;
38 private static final byte LATIN_CAPITAL_G = 0x47;
39 private static final byte PERCENT_SIGN = 0x25;
40 private static final byte ESC = 0x1B;
41
42 /**
43 * Converts the given ISO2022 char set to a Java charset name.
44 *
45 * @param bytes string data encoded using ISO2022
46 * @return the Java charset name as a string, or <code>null</code> if the conversion was not possible
47 */
48 @Nullable
49 public static String convertISO2022CharsetToJavaCharset(@NotNull final byte[] bytes)
50 {
51 if (bytes.length > 2 && bytes[0] == ESC && bytes[1] == PERCENT_SIGN && bytes[2] == LATIN_CAPITAL_G)
52 return UTF_8;
53
54 if (bytes.length > 3 && bytes[0] == ESC && (bytes[3] & 0xFF | ((bytes[2] & 0xFF) << 8) | ((bytes[1] & 0xFF) << 16)) == DOT && bytes[4] == LATIN_CAPITAL_A)
55 return ISO_8859_1;
56
57 return null;
58 }
59
60 /**
61 * Attempts to guess the {@link Charset} of a string provided as a byte array.
62 * <p>
63 * Charsets trialled are, in order:
64 * <ul>
65 * <li>UTF-8</li>
66 * <li><code>System.getProperty("file.encoding")</code></li>
67 * <li>ISO-8859-1</li>
68 * </ul>
69 * <p>
70 * Its only purpose is to guess the Charset if and only if IPTC tag coded character set is not set. If the
71 * encoding is not UTF-8, the tag should be set. Otherwise it is bad practice. This method tries to
72 * workaround this issue since some metadata manipulating tools do not prevent such bad practice.
73 * <p>
74 * About the reliability of this method: The check if some bytes are UTF-8 or not has a very high reliability.
75 * The two other checks are less reliable.
76 *
77 * @param bytes some text as bytes
78 * @return the name of the encoding or null if none could be guessed
79 */
80 @Nullable
81 static Charset guessCharSet(@NotNull final byte[] bytes)
82 {
83 String[] encodings = { UTF_8, System.getProperty("file.encoding"), ISO_8859_1 };
84
85 for (String encoding : encodings)
86 {
87 Charset charset = Charset.forName(encoding);
88 CharsetDecoder cs = charset.newDecoder();
89
90 try {
91 cs.decode(ByteBuffer.wrap(bytes));
92 return charset;
93 } catch (CharacterCodingException e) {
94 // fall through...
95 }
96 }
97
98 // No encodings succeeded. Return null.
99 return null;
100 }
101
102 private Iso2022Converter()
103 {}
104}
Note: See TracBrowser for help on using the repository browser.