/* * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one * or more contributor license agreements. Licensed under the "Elastic License * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side * Public License v 1"; you may not use this file except in compliance with, at * your election, the "Elastic License 2.0", the "GNU Affero General Public * License v3.0 only", or the "Server Side Public License, v 1". */ package org.elasticsearch.lucene.search.uhighlight; import org.elasticsearch.test.ESTestCase; import java.text.BreakIterator; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import java.util.Locale; import static org.hamcrest.Matchers.greaterThan; import static org.hamcrest.Matchers.greaterThanOrEqualTo; import static org.hamcrest.Matchers.lessThanOrEqualTo; public class BoundedBreakIteratorScannerTests extends ESTestCase { private static final String[] WORD_BOUNDARIES = new String[] { " ", " ", "\t", "#", "\n" }; private static final String[] SENTENCE_BOUNDARIES = new String[] { "! ", "? ", ". ", ".\n", ".\n\n" }; private void testRandomAsciiTextCase(BreakIterator bi, int maxLen) { // Generate a random set of unique terms with ascii character int maxSize = randomIntBetween(5, 100); String[] vocabulary = new String[maxSize]; for (int i = 0; i < maxSize; i++) { if (rarely()) { vocabulary[i] = randomAlphaOfLengthBetween(50, 200); } else { vocabulary[i] = randomAlphaOfLengthBetween(1, 30); } } // Generate a random text made of random terms separated with word-boundaries // and sentence-boundaries. StringBuilder text = new StringBuilder(); List offsetList = new ArrayList<>(); List sizeList = new ArrayList<>(); // the number of sentences to generate int numSentences = randomIntBetween(10, 100); int maxTermLen = 0; for (int i = 0; i < numSentences; i++) { // the number of terms in the sentence int numTerms = randomIntBetween(5, 10); for (int j = 0; j < numTerms; j++) { int termId = randomIntBetween(0, vocabulary.length - 1); String term = vocabulary[termId].toLowerCase(Locale.ROOT); if (j == 0) { // capitalize the first letter of the first term in the sentence term = term.substring(0, 1).toUpperCase(Locale.ROOT) + term.substring(1); } else { String sep = randomFrom(WORD_BOUNDARIES); text.append(sep); } maxTermLen = Math.max(term.length(), maxTermLen); offsetList.add(text.length()); sizeList.add(term.length()); text.append(term); } String boundary = randomFrom(SENTENCE_BOUNDARIES); text.append(boundary); } int[] sizes = sizeList.stream().mapToInt(i -> i).toArray(); int[] offsets = offsetList.stream().mapToInt(i -> i).toArray(); bi.setText(text.toString()); int currentPos = randomIntBetween(0, 20); int lastEnd = -1; int maxPassageLen = maxLen + (maxTermLen * 2); while (currentPos < offsets.length) { // find the passage that contains the current term int nextOffset = offsets[currentPos]; int start = bi.preceding(nextOffset + 1); int end = bi.following(nextOffset); // check that the passage is valid assertThat(start, greaterThanOrEqualTo(lastEnd)); assertThat(end, greaterThan(start)); assertThat(start, lessThanOrEqualTo(nextOffset)); assertThat(end, greaterThanOrEqualTo(nextOffset)); int passageLen = end - start; assertThat(passageLen, lessThanOrEqualTo(maxPassageLen)); // checks that the start and end of the passage are on word boundaries. int startPos = Arrays.binarySearch(offsets, start); int endPos = Arrays.binarySearch(offsets, end); if (startPos < 0) { int lastWordEnd = offsets[Math.abs(startPos) - 2] + sizes[Math.abs(startPos) - 2]; assertThat(start, greaterThanOrEqualTo(lastWordEnd)); } if (endPos < 0) { if (Math.abs(endPos) - 2 < offsets.length) { int lastWordEnd = offsets[Math.abs(endPos) - 2] + sizes[Math.abs(endPos) - 2]; assertThat(end, greaterThanOrEqualTo(lastWordEnd)); } // advance the position to the end of the current passage currentPos = (Math.abs(endPos) - 1); } else { // advance the position to the end of the current passage currentPos = endPos; } // randomly advance to the next term to highlight currentPos += randomIntBetween(0, 20); lastEnd = end; } } public void testBoundedSentence() { for (int i = 0; i < 20; i++) { int maxLen = randomIntBetween(10, 500); testRandomAsciiTextCase(BoundedBreakIteratorScanner.getSentence(Locale.ROOT, maxLen), maxLen); } } public void testTextThatEndsBeforeMaxLen() { BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 1000); final String text = "This is the first test sentence. Here is the second one."; int offset = text.indexOf("first"); bi.setText(text); assertEquals(0, bi.preceding(offset)); assertEquals(text.length(), bi.following(offset - 1)); offset = text.indexOf("second"); bi.setText(text); assertEquals(33, bi.preceding(offset)); assertEquals(text.length(), bi.following(offset - 1)); } public void testFragmentSizeThatIsTooBig() { final int fragmentSize = Integer.MAX_VALUE; BreakIterator bi = BoundedBreakIteratorScanner.getSentence(Locale.ROOT, fragmentSize); final String text = "Any sentence"; final int offset = 0; // find at beggining of text bi.setText(text); assertEquals(0, bi.preceding(offset)); assertEquals(text.length(), bi.following(offset - 1)); } }