/* * Copyright (c) 2011, 2024, Oracle and/or its affiliates. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License version 2 only, as * published by the Free Software Foundation. Oracle designates this * particular file as subject to the "Classpath" exception as provided * by Oracle in the LICENSE file that accompanied this code. * * This code is distributed in the hope that it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License * version 2 for more details (a copy is included in the LICENSE file that * accompanied this code). * * You should have received a copy of the GNU General Public License version * 2 along with this work; if not, write to the Free Software Foundation, * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. * * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA * or visit www.oracle.com if you need additional information or have any * questions. */ package com.sun.tools.javac.parser; import java.util.Arrays; import com.sun.tools.javac.resources.CompilerProperties.Errors; import com.sun.tools.javac.util.Log; import static com.sun.tools.javac.util.LayoutCharacters.EOI; import static com.sun.tools.javac.util.LayoutCharacters.tabulate; /** * The unicode character reader used by the javac/javadoc lexer/tokenizer, returns characters * one by one as contained in the input stream, handling unicode escape sequences accordingly. * *

 * <p><b>This is NOT part of any supported API.
 * If you write code that depends on this, you do so at your own risk.
 * This code and its internal interfaces are subject to change or
 * deletion without notice.</b>

*/ public class UnicodeReader { /** * Buffer containing characters from source file. May contain extraneous characters * beyond this.length. */ private final char[] buffer; /** * Length of meaningful content in buffer. */ private final int length; /** * Virtual position offset in the original buffer. */ private final int offset; /** * Character buffer index of character currently being observed. */ private int position; /** * Number of characters combined to provide character currently being observed. Typically * one, but may be more when combinations of surrogate pairs and unicode escape sequences * are read. */ private int width; /** * Character currently being observed. If a surrogate pair is read then will be the high * member of the pair. */ private char character; /** * Codepoint of character currently being observed. Typically equivalent to the character * but will have a value greater that 0xFFFF when a surrogate pair. */ private int codepoint; /** * true if the last character was a backslash. This is used to handle the special case * when a backslash precedes an unicode escape. In that case, the second backslash * is treated as a backslash and not part of an unicode escape. */ private boolean wasBackslash; /** * true if the last character was derived from an unicode escape sequence. */ private boolean wasUnicodeEscape; /** * Log for error reporting. */ private final Log log; /** * Constructor. * * @param sf scan factory. * @param array array containing contents of source. * @param length length of meaningful content in buffer. */ protected UnicodeReader(ScannerFactory sf, char[] array, int length) { this(sf.log, array, length); } /** * Constructor. * * @param log Log for error reporting. * @param array array containing contents of source. * @param length length of meaningful content in buffer. */ protected UnicodeReader(Log log, char[] array, int length) { this(log, array, 0, 0, length); } /** * Constructor. * * @param log Log for error reporting. 
* @param array array containing contents of source. * @param pos start of meaningful content in buffer. * @param endPos end of meaningful content in buffer. */ @SuppressWarnings("this-escape") protected UnicodeReader(Log log, char[] array, int offset, int pos, int endPos) { this.buffer = array; this.length = endPos; this.offset = offset; this.position = pos; this.width = 0; this.character = '\0'; this.codepoint = 0; this.wasBackslash = false; this.wasUnicodeEscape = false; this.log = log; nextCodePoint(); } /** * Returns the character buffer. * * @return character buffer. */ protected char[] buffer() { return buffer; } /** * Returns the length of the buffer. This is length of meaningful content in buffer and * not the length of the buffer array. * * @return length of the buffer. */ protected int length() { return length; } /** * Return true if current position is within the meaningful part of the buffer. * * @return true if current position is within the meaningful part of the buffer. */ protected boolean isAvailable() { return position < length; } /** * Fetches the next 16-bit character from the buffer and places it in this.character. */ private void nextCodeUnit() { // Index of next character in buffer. int index = position + width; // If past end of buffer. if (length <= index) { // End of file is marked with EOI. character = EOI; } else { // Next character in buffer. character = buffer[index]; // Increment length of codepoint. width++; } } /** * Fetches the next 16-bit character from the buffer. If an unicode escape * is detected then converts the unicode escape to a character. */ private void nextUnicodeInputCharacter() { // Position to next codepoint. position += width; // Codepoint has no characters yet. width = 0; // Fetch next character. nextCodeUnit(); if (character == '\\' && (!wasBackslash || wasUnicodeEscape)) { // Is a backslash and may be an unicode escape. 
switch (unicodeEscape()) { case BACKSLASH -> { wasUnicodeEscape = false; wasBackslash = !wasBackslash; } case VALID_ESCAPE -> { wasUnicodeEscape = true; wasBackslash = character == '\\' && !wasBackslash; } case BROKEN_ESCAPE -> nextUnicodeInputCharacter(); //skip broken unicode escapes } } else { wasBackslash = false; wasUnicodeEscape = false; } // Codepoint and character match if not surrogate. codepoint = (int)character; } /** * Fetches the nextcode point from the buffer. If an unicode escape is recognized * then converts unicode escape to a character. If two characters are a surrogate pair * then converts to a codepoint. */ private void nextCodePoint() { // Next unicode character. nextUnicodeInputCharacter(); // Return early if ASCII or not a surrogate pair. if (isASCII() || !Character.isHighSurrogate(character)) { return; } // Capture high surrogate and position. char hi = character; int savePosition = position; int saveWidth = width; // Get potential low surrogate. nextUnicodeInputCharacter(); char lo = character; if (Character.isLowSurrogate(lo)) { // Start codepoint at start of high surrogate. position = savePosition; width += saveWidth; // Compute codepoint. codepoint = Character.toCodePoint(hi, lo); } else { // Restore to treat high surrogate as just a character. position = savePosition; width = saveWidth; character = hi; codepoint = (int)hi; // Could potential report an error here (old code did not.) } } /** * Converts an unicode escape into a character. * * @return true if was an unicode escape. */ private UnicodeEscapeResult unicodeEscape() { // Start of unicode escape (past backslash.) int start = position + width; // Default to backslash result, unless proven otherwise. character = '\\'; width = 1; // Skip multiple 'u'. int index; for (index = start; index < length; index++) { if (buffer[index] != 'u') { break; } } // Needs to have been at least one u. 
if (index == start) { return UnicodeEscapeResult.BACKSLASH; } int code = 0; for (int i = 0; i < 4; i++) { // Translate and merge digit. int digit = index < length ? Character.digit(buffer[index], 16) : -1; code = code << 4 | digit; // If invalid digit. if (code < 0) { break; } // On to next character. index++; } // Skip digits even if error. width = index - position; // If all digits are good. if (code >= 0) { character = (char)code; return UnicodeEscapeResult.VALID_ESCAPE; } else { log.error(index, Errors.IllegalUnicodeEsc); return UnicodeEscapeResult.BROKEN_ESCAPE; } } private enum UnicodeEscapeResult { BACKSLASH, VALID_ESCAPE, BROKEN_ESCAPE; } /** * Return the virtual position in the character buffer. * * @return virtual position in the character buffer. */ protected int position() { return offset + position; } /** * Reset the reader to the specified virtual position. * Warning: Do not use when previous character was an ASCII or unicode backslash. * @param pos */ protected void reset(int pos) { position = pos - offset; width = 0; wasBackslash = false; wasUnicodeEscape = false; nextCodePoint(); } /** * Return the current character in at the current position. * * @return current character in at the current position. */ protected char get() { return character; } /** * Return the current codepoint in at the current position. * * @return current codepoint in at the current position. */ protected int getCodepoint() { return codepoint; } /** * Returns true if the current codepoint is a surrogate. * * @return true if the current codepoint is a surrogate. */ protected boolean isSurrogate() { return 0xFFFF < codepoint; } /** * Returns true if the current character is ASCII. * * @return true if the current character is ASCII. */ protected boolean isASCII() { return character <= 0x7F; } /** * Advances the current character to the next character. * * @return next character. */ protected char next() { nextCodePoint(); return character; } /** * Compare character. 
Returns true if a match. * * @param ch character to match. * * @return true if a match. */ protected boolean is(char ch) { return character == ch; } /** * Match one of the arguments. Returns true if a match. */ protected boolean isOneOf(char ch1, char ch2) { return is(ch1) || is(ch2); } protected boolean isOneOf(char ch1, char ch2, char ch3) { return is(ch1) || is(ch2) || is(ch3); } protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4) { return is(ch1) || is(ch2) || is(ch3) || is(ch4); } protected boolean isOneOf(char ch1, char ch2, char ch3, char ch4, char ch5, char ch6) { return is(ch1) || is(ch2) || is(ch3) || is(ch4) || is(ch5) || is(ch6); } /** * Tests to see if current character is in the range of lo to hi characters (inclusive). * * @param lo lowest character in range. * @param hi highest character in range. * * @return true if the current character is in range. */ protected boolean inRange(char lo, char hi) { return lo <= character && character <= hi; } /** * Compare character and advance if a match. Returns true if a match. * * @param ch character to match. * * @return true if a match. */ protected boolean accept(char ch) { if (is(ch)) { next(); return true; } return false; } /** * Match one of the arguments and advance if a match. Returns true if a match. */ protected boolean acceptOneOf(char ch1, char ch2) { if (isOneOf(ch1, ch2)) { next(); return true; } return false; } /** * Match one of the arguments and advance if a match. Returns true if a match. */ protected boolean acceptOneOf(char ch1, char ch2, char ch3) { if (isOneOf(ch1, ch2, ch3)) { next(); return true; } return false; } /** * Return a reader which is bracketed by the currect position * and the next line terminator. 
* * @return a new reader */ protected UnicodeReader lineReader() { int pos = position; skipToEOLN(); int endPos = position; accept('\r'); accept('\n'); return new UnicodeReader(log, buffer, offset, pos, endPos); } /** * Return a reader which is bracketed by the {@code pos} * and {@code endPos}. * * @param pos initial position * @param endPos end position * * @return a new reader */ protected UnicodeReader lineReader(int pos, int endPos) { return new UnicodeReader(log, buffer, offset, pos - offset, endPos - offset); } /** * Skip over all occurrences of character. * * @param ch character to accept. * * @return number of characters skipped */ protected int skip(char ch) { int count = 0; while (accept(ch)) { count++; } return count; } /** * Is ASCII white space character. * * @return true if is ASCII white space character */ protected boolean isWhitespace() { return isOneOf(' ', '\t', '\f'); } /** * Skip over ASCII white space characters. */ protected void skipWhitespace() { while (acceptOneOf(' ', '\t', '\f')) { // next } } /** * Is ASCII line terminator. * * @return true if is ASCII line terminator. */ protected boolean isEOLN() { return isOneOf('\r', '\n'); } /** * Skip to end of line. */ protected void skipToEOLN() { while (isAvailable()) { if (isEOLN()) { break; } next(); } } /** * Compare string and advance if a match. Returns true if a match. * Warning: Do not use when previous character was a backslash * (confuses state of wasBackslash.) * * @param string string to match character for character. * * @return true if a match. */ protected boolean accept(String string) { // Quick test. if (string.length() == 0 || !is(string.charAt(0))) { return false; } // Be prepared to retreat if not a match. int savedPosition = position(); nextCodePoint(); // Check each character. for (int i = 1; i < string.length(); i++) { if (!is(string.charAt(i))) { // Restart if not a match. 
reset(savedPosition); return false; } nextCodePoint(); } return true; } /** * Convert an ASCII digit from its base (8, 10, or 16) to its value. Does not * advance character. * * @param pos starting position. * @param digitRadix base of number being converted. * * @return value of digit. */ protected int digit(int pos, int digitRadix) { int result; // Just an ASCII digit. if (inRange('0', '9')) { // Fast common case. result = character - '0'; return result < digitRadix ? result : -1; } // Handle other digits. result = isSurrogate() ? Character.digit(codepoint, digitRadix) : Character.digit(character, digitRadix); if (result >= 0 && !isASCII()) { log.error(position(), Errors.IllegalNonasciiDigit); character = "0123456789abcdef".charAt(result); } return result; } /** * Returns the input buffer. Unicode escape sequences are not translated. * * @return the input buffer. */ public char[] getRawCharacters() { return length == buffer.length ? buffer : Arrays.copyOf(buffer, length); } /** * Returns a copy of a character array subset of the input buffer. * The returned array begins at the {@code beginIndex} and * extends to the character at index {@code endIndex - 1}. * Thus the length of the substring is {@code endIndex-beginIndex}. * This behavior is like * {@code String.substring(beginIndex, endIndex)}. * Unicode escape sequences are not translated. * * @param beginIndex the beginning index, inclusive. * @param endIndex the ending index, exclusive. * * @throws ArrayIndexOutOfBoundsException if either offset is outside of the * array bounds */ public char[] getRawCharacters(int beginIndex, int endIndex) { return Arrays.copyOfRange(buffer, beginIndex, endIndex); } /** * Returns a string subset of the input buffer. * The returned string begins at the {@code beginIndex} and * extends to the character at index {@code endIndex - 1}. * Thus the length of the substring is {@code endIndex-beginIndex}. * This behavior is like * {@code String.substring(beginIndex, endIndex)}. 
* Unicode escape sequences are not translated. * * @param beginIndex the beginning index, inclusive. * @param endIndex the ending index, exclusive. * * @throws ArrayIndexOutOfBoundsException if either offset is outside of the * array bounds */ public String getRawString(int beginIndex, int endIndex) { return new String(buffer, beginIndex, endIndex - beginIndex); } /** * Returns a string subset of the input buffer. * The returned string begins at the {@code position} and * extends to the character at index {@code length - 1}. * Thus the length of the substring is {@code length-position}. * This behavior is like * {@code String.substring(position, length)}. * Unicode escape sequences are not translated. * * @throws ArrayIndexOutOfBoundsException if either offset is outside of the * array bounds */ public String getRawString() { return getRawString(position, length); } /** * This is a specialized version of UnicodeReader that keeps track of the * column position within a given character stream. Used for Javadoc * processing to build a table for mapping positions in the comment string * to positions in the source file. */ static class PositionTrackingReader extends UnicodeReader { /** * Current column in the comment. */ private int column; /** * Constructor. * * @param reader existing reader * @param pos start of meaningful content in buffer. * @param endPos end of meaningful content in buffer. */ protected PositionTrackingReader(UnicodeReader reader, int pos, int endPos) { super(reader.log, reader.getRawCharacters(pos, endPos), reader.offset + pos, 0, endPos - pos); this.column = 0; } /** * Advances the current character to the next character. Tracks column. * * @return next character. */ @Override protected char next() { super.next(); if (isOneOf('\n', '\r', '\f')) { column = 0; } else if (is('\t')) { column = tabulate(column); } else { column++; } return get(); } /** * Returns the current column. * * @return the current column. 
*/ protected int column() { return column; } } }