alvinalexander.com | career | drupal | java | mac | mysql | perl | scala | uml | unix  

Lucene example source code file (BrazilianStemmer.java)

This example Lucene source code file (BrazilianStemmer.java) is included in the DevDaily.com "Java Source Code Warehouse" project. The intent of this project is to help you "Learn Java by Example"™.

Java - Lucene tags/keywords

brazilianstemmer, ct, ct, r1, r2, r2, rv, rv, string, string, term

The Lucene BrazilianStemmer.java source code

package org.apache.lucene.analysis.br;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

/**
 * A stemmer for Brazilian Portuguese words.
 *
 * <p>The stemmer normalises a term (lower-case, accents stripped, one
 * punctuation character trimmed from each end) into the "changed term"
 * {@code CT}, derives the regions {@code R1}, {@code R2} and {@code RV}
 * from it, and then strips suffixes in five steps.
 *
 * <p>The working state of the most recent {@link #stem(String)} call is kept
 * in instance fields, so a single instance must not be used by several
 * threads at the same time.
 */
public class BrazilianStemmer {

  /** Region of the changed term in which a step-1 suffix must occur. */
  private enum Region { IN_R1, IN_R2, IN_RV }

  /** One step-1 rule: a suffix, where it must occur and what replaces it. */
  private static final class SuffixRule {
    final String suffix;
    final Region region;
    /** Text appended after the suffix is removed; empty for plain deletion. */
    final String replacement;
    /** Text that must directly precede the suffix, or null for no constraint. */
    final String precededBy;

    SuffixRule(String suffix, Region region, String replacement, String precededBy) {
      this.suffix = suffix;
      this.region = region;
      this.replacement = replacement;
      this.precededBy = precededBy;
    }
  }

  /** Builds a rule that deletes {@code suffix} when it lies in {@code region}. */
  private static SuffixRule delete(String suffix, Region region) {
    return new SuffixRule(suffix, region, "", null);
  }

  /** Builds a rule that swaps {@code suffix} for {@code replacement}. */
  private static SuffixRule replace(String suffix, Region region, String replacement) {
    return new SuffixRule(suffix, region, replacement, null);
  }

  /**
   * Step-1 rules (standard suffixes). The first rule whose suffix ends both
   * CT and the rule's region wins, so the order of this table is significant.
   */
  private static final SuffixRule[] STEP1_RULES = {
    replace("uciones", Region.IN_R2, "u"),
    delete("imentos", Region.IN_R2),
    delete("amentos", Region.IN_R2),
    delete("adores", Region.IN_R2),
    delete("adoras", Region.IN_R2),
    replace("logias", Region.IN_R2, "log"),
    replace("encias", Region.IN_R2, "ente"),
    delete("amente", Region.IN_R1),
    delete("idades", Region.IN_R2),
    delete("acoes", Region.IN_R2),
    delete("imento", Region.IN_R2),
    delete("amento", Region.IN_R2),
    delete("adora", Region.IN_R2),
    delete("ismos", Region.IN_R2),
    delete("istas", Region.IN_R2),
    replace("logia", Region.IN_R2, "log"),
    replace("ucion", Region.IN_R2, "u"),
    replace("encia", Region.IN_R2, "ente"),
    delete("mente", Region.IN_R2),
    delete("idade", Region.IN_R2),
    delete("acao", Region.IN_R2),
    delete("ezas", Region.IN_R2),
    delete("icos", Region.IN_R2),
    delete("icas", Region.IN_R2),
    delete("ismo", Region.IN_R2),
    delete("avel", Region.IN_R2),
    delete("ivel", Region.IN_R2),
    delete("ista", Region.IN_R2),
    delete("osos", Region.IN_R2),
    delete("osas", Region.IN_R2),
    delete("ador", Region.IN_R2),
    delete("ivas", Region.IN_R2),
    delete("ivos", Region.IN_R2),
    new SuffixRule("iras", Region.IN_RV, "ir", "e"),
    delete("eza", Region.IN_R2),
    delete("ico", Region.IN_R2),
    delete("ica", Region.IN_R2),
    delete("oso", Region.IN_R2),
    delete("osa", Region.IN_R2),
    delete("iva", Region.IN_R2),
    delete("ivo", Region.IN_R2),
    new SuffixRule("ira", Region.IN_RV, "ir", "e"),
  };

  /**
   * Step-2 verb suffixes, grouped by length and ordered longest first so that
   * the first match is also the longest match.
   */
  private static final String[] STEP2_SUFFIXES = {
    // length 7
    "issemos", "essemos", "assemos", "ariamos", "eriamos", "iriamos",
    // length 6
    "iremos", "eremos", "aremos", "avamos", "iramos", "eramos", "aramos",
    "asseis", "esseis", "isseis", "arieis", "erieis", "irieis",
    // length 5
    "irmos", "iamos", "armos", "ermos", "areis", "ereis", "ireis",
    "asses", "esses", "isses", "astes", "assem", "essem", "issem",
    "ardes", "erdes", "irdes", "ariam", "eriam", "iriam",
    "arias", "erias", "irias", "estes", "istes", "aveis",
    // length 4
    "aria", "eria", "iria", "asse", "esse", "isse", "aste", "este", "iste",
    "arei", "erei", "irei", "aram", "eram", "iram", "avam",
    "arem", "erem", "irem", "ando", "endo", "indo", "arao", "erao", "irao",
    "adas", "idas", "aras", "eras", "iras", "avas", "ares", "eres", "ires",
    "ados", "idos", "amos", "emos", "imos", "ieis",
    // length 3
    "ada", "ida", "ara", "era", "ava", "iam", "ado", "ido",
    "ias", "ais", "eis", "ira", "ear",
    // length 2
    "ia", "ei", "am", "em", "ar", "er", "ir", "as", "es", "is",
    "eu", "iu", "ou",
  };

  /** Characters that createCT trims from either end of the changed term. */
  private static final String EDGE_PUNCTUATION = "\"'-,;.?!";

  /** Original term and changed term joined by ';' (only used by log()). */
  private String TERM;
  /** Changed term: the normalised word that the steps shorten in place. */
  private String CT;
  /** Region R1 of the changed term, or null when it does not exist. */
  private String R1;
  /** Region R2 (R1 computed on R1), or null when it does not exist. */
  private String R2;
  /** Region RV of the changed term, or null when it does not exist. */
  private String RV;

  public BrazilianStemmer() {
  }

  /**
   * Stems the given term to a unique {@code discriminator}.
   *
   * @param term  the term that should be stemmed; may be null
   * @return      the discriminator for {@code term}; the normalised term
   *              itself when it contains a non-letter character; or null
   *              when {@code term} is null or its normalised form is not
   *              between 3 and 29 characters long
   */
  protected String stem(String term) {
    // Without this guard a null term fails inside createCT.
    if (term == null) {
      return null;
    }

    createCT(term);

    if (!isIndexable(CT)) {
      return null;
    }
    if (!isStemmable(CT)) {
      return CT;
    }

    R1 = getR1(CT);
    R2 = getR1(R1);
    RV = getRV(CT);
    TERM = term + ";" + CT;

    // The regions are computed once, before any suffix is removed; the
    // later steps test these original regions rather than the shortened CT.
    // step2 only runs when step1 found nothing.
    boolean altered = step1() || step2();

    if (altered) {
      step3();
    } else {
      step4();
    }

    step5();

    return CT;
  }

  /**
   * Checks whether a term can be processed correctly.
   *
   * @return true if, and only if, the given term consists only of letters
   */
  private boolean isStemmable(String term) {
    for (int c = 0; c < term.length(); c++) {
      // Discard terms that contain non-letter characters.
      if (!Character.isLetter(term.charAt(c))) {
        return false;
      }
    }
    return true;
  }

  /**
   * Checks whether a term has an indexable length.
   *
   * @return true if the term is longer than 2 and shorter than 30 characters
   */
  private boolean isIndexable(String term) {
    return (term.length() < 30) && (term.length() > 2);
  }

  /**
   * Tests whether a character is one of 'a', 'e', 'i', 'o', 'u'.
   * Only these lower-case, unaccented letters count as vowels.
   *
   * @return true if the character is a vowel
   */
  private boolean isVowel(char value) {
    return "aeiou".indexOf(value) >= 0;
  }

  /**
   * Gets R1: the region after the first non-vowel that follows a vowel.
   *
   * <p>The last character of {@code value} is never examined by either scan,
   * so a region that would be empty is reported as null instead.
   *
   * @return null (also for a null argument) or a string representing R1
   */
  private String getR1(String value) {
    if (value == null) {
      return null;
    }

    int last = value.length() - 1;
    int pos = 0;

    // find the first vowel
    while (pos < last && !isVowel(value.charAt(pos))) {
      pos++;
    }
    if (pos >= last) {
      return null;
    }

    // find the first non-vowel after it
    while (pos < last && isVowel(value.charAt(pos))) {
      pos++;
    }
    if (pos >= last) {
      return null;
    }

    return value.substring(pos + 1);
  }

  /**
   * Gets RV.
   *
   * <p>If the second letter is a consonant, RV is the region after the next
   * following vowel; or if the first two letters are vowels, RV is the
   * region after the next consonant; otherwise RV is the region after the
   * third letter.
   *
   * <p>When one of the first two searches does not find its character before
   * the last position, control falls through to the following rule(s) rather
   * than returning immediately.
   *
   * @return null (also for a null argument) or a string representing RV
   */
  private String getRV(String value) {
    if (value == null) {
      return null;
    }

    int last = value.length() - 1;

    // second letter is a consonant: region after the next vowel
    if ((last > 0) && !isVowel(value.charAt(1))) {
      int pos = 2;
      while (pos < last && !isVowel(value.charAt(pos))) {
        pos++;
      }
      if (pos < last) {
        return value.substring(pos + 1);
      }
    }

    // first two letters are vowels: region after the next consonant
    if ((last > 1) && isVowel(value.charAt(0)) && isVowel(value.charAt(1))) {
      int pos = 2;
      while (pos < last && isVowel(value.charAt(pos))) {
        pos++;
      }
      if (pos < last) {
        return value.substring(pos + 1);
      }
    }

    // otherwise: region after the third letter
    if (last > 2) {
      return value.substring(3);
    }

    return null;
  }

  /**
   * Normalises a term: lower-cases it and replaces accented letters by their
   * plain counterparts (accented a/e/i/o/u, u-diaeresis, c-cedilla and
   * n-tilde). All other characters are copied unchanged.
   *
   * <p>Lower-casing uses the root locale so the result does not depend on
   * the default locale of the JVM. Accented letters are written as Unicode
   * escapes so the source compiles identically under any file encoding.
   *
   * @return null for a null argument, otherwise the transformed string
   */
  private String changeTerm(String value) {
    if (value == null) {
      return null;
    }

    String lower = value.toLowerCase(java.util.Locale.ROOT);
    StringBuilder out = new StringBuilder(lower.length());
    for (int j = 0; j < lower.length(); j++) {
      char c = lower.charAt(j);
      switch (c) {
        case '\u00e1': // a acute
        case '\u00e2': // a circumflex
        case '\u00e3': // a tilde
          out.append('a');
          break;
        case '\u00e9': // e acute
        case '\u00ea': // e circumflex
          out.append('e');
          break;
        case '\u00ed': // i acute
          out.append('i');
          break;
        case '\u00f3': // o acute
        case '\u00f4': // o circumflex
        case '\u00f5': // o tilde
          out.append('o');
          break;
        case '\u00fa': // u acute
        case '\u00fc': // u diaeresis
          out.append('u');
          break;
        case '\u00e7': // c cedilla
          out.append('c');
          break;
        case '\u00f1': // n tilde
          out.append('n');
          break;
        default:
          out.append(c);
          break;
      }
    }

    return out.toString();
  }

  /**
   * Checks whether a string ends with a suffix.
   *
   * @return true if the string ends with the specified suffix; false when
   *         either argument is null
   */
  private boolean suffix(String value, String suffix) {
    if ((value == null) || (suffix == null)) {
      return false;
    }
    return value.endsWith(suffix);
  }

  /**
   * Replaces a string suffix by another.
   *
   * @return {@code value} with {@code toReplace} swapped for
   *         {@code changeTo}; {@code value} unchanged when any argument is
   *         null or removing the suffix leaves the string unchanged
   */
  private String replaceSuffix(String value, String toReplace, String changeTo) {
    if ((value == null) || (toReplace == null) || (changeTo == null)) {
      return value;
    }

    String stripped = removeSuffix(value, toReplace);
    if (value.equals(stripped)) {
      return value;
    }
    return stripped + changeTo;
  }

  /**
   * Removes a string suffix.
   *
   * @return the string without the suffix, or {@code value} unchanged when
   *         it does not end with {@code toRemove} or an argument is null
   */
  private String removeSuffix(String value, String toRemove) {
    if ((value == null) || (toRemove == null) || !suffix(value, toRemove)) {
      return value;
    }
    return value.substring(0, value.length() - toRemove.length());
  }

  /**
   * Checks whether a suffix is directly preceded by a given string.
   *
   * @return true if {@code value} ends with {@code preceded} followed by
   *         {@code suffix}; false when any argument is null
   */
  private boolean suffixPreceded(String value, String suffix, String preceded) {
    if ((value == null) || (suffix == null) || (preceded == null)
        || !suffix(value, suffix)) {
      return false;
    }
    return suffix(removeSuffix(value, suffix), preceded);
  }

  /** Tells whether {@code c} is one of the characters trimmed by createCT. */
  private static boolean isEdgePunctuation(char c) {
    return EDGE_PUNCTUATION.indexOf(c) >= 0;
  }

  /**
   * Creates CT (the changed term): normalises {@code term} with changeTerm
   * and then trims at most one punctuation character from the start and at
   * most one from the end. Terms shorter than two characters are not trimmed.
   *
   * @param term the raw term; must not be null
   */
  private void createCT(String term) {
    CT = changeTerm(term);

    if (CT.length() < 2) {
      return;
    }
    if (isEdgePunctuation(CT.charAt(0))) {
      CT = CT.substring(1);
    }

    if (CT.length() < 2) {
      return;
    }
    if (isEdgePunctuation(CT.charAt(CT.length() - 1))) {
      CT = CT.substring(0, CT.length() - 1);
    }
  }

  /** Returns the current contents of the given region (may be null). */
  private String regionText(Region region) {
    switch (region) {
      case IN_R1:
        return R1;
      case IN_R2:
        return R2;
      default:
        return RV;
    }
  }

  /**
   * Standard suffix removal: applies the first rule of STEP1_RULES whose
   * suffix ends both CT and the rule's region (and, where the rule demands
   * it, is preceded by the required text).
   *
   * @return false if no ending was removed
   */
  private boolean step1() {
    if (CT == null) {
      return false;
    }

    for (SuffixRule rule : STEP1_RULES) {
      if (!suffix(CT, rule.suffix) || !suffix(regionText(rule.region), rule.suffix)) {
        continue;
      }
      if ((rule.precededBy != null)
          && !suffixPreceded(CT, rule.suffix, rule.precededBy)) {
        continue;
      }
      // With an empty replacement this is a plain deletion of the suffix.
      CT = replaceSuffix(CT, rule.suffix, rule.replacement);
      return true;
    }

    // no ending was removed by step1
    return false;
  }

  /**
   * Verb suffixes: searches RV for the longest suffix listed in
   * STEP2_SUFFIXES and, if one is found, deletes it from CT.
   *
   * @return false if no ending was removed
   */
  private boolean step2() {
    if (RV == null) {
      return false;
    }

    for (String verbSuffix : STEP2_SUFFIXES) {
      if (suffix(RV, verbSuffix)) {
        CT = removeSuffix(CT, verbSuffix);
        return true;
      }
    }

    // no ending was removed by step2
    return false;
  }

  /**
   * Deletes the suffix 'i' from CT if RV ends with 'i' preceded by 'c'.
   * RV is the region computed before steps 1 and 2 ran, so CT is left
   * untouched when it no longer ends with 'i'.
   */
  private void step3() {
    if (RV == null) {
      return;
    }

    if (suffix(RV, "i") && suffixPreceded(RV, "i", "c")) {
      CT = removeSuffix(CT, "i");
    }
  }

  /**
   * Residual suffix: if RV ends with one of "os", "a", "i" or "o", deletes
   * that suffix from CT. Accented variants need no separate handling because
   * changeTerm has already stripped the accents.
   */
  private void step4() {
    if (RV == null) {
      return;
    }

    String[] residualSuffixes = { "os", "a", "i", "o" };
    for (String residual : residualSuffixes) {
      if (suffix(RV, residual)) {
        CT = removeSuffix(CT, residual);
        return;
      }
    }
  }

  /**
   * If RV ends with 'e', deletes the 'e' from CT; if that 'e' is preceded in
   * RV by "gu" (or "ci"), the 'u' (or 'i') is deleted from CT as well.
   * A trailing cedilla never reaches this step because changeTerm has
   * already replaced it by a plain 'c'.
   */
  private void step5() {
    if (RV == null) {
      return;
    }
    if (!suffix(RV, "e")) {
      return;
    }

    CT = removeSuffix(CT, "e");
    if (suffixPreceded(RV, "e", "gu")) {
      CT = removeSuffix(CT, "u");
    } else if (suffixPreceded(RV, "e", "ci")) {
      CT = removeSuffix(CT, "i");
    }
  }

  /**
   * For log and debug purposes.
   *
   * @return TERM, CT, RV, R1 and R2 as left by the most recent stem call
   *         that got as far as computing the regions
   */
  public String log() {
    return " (TERM = " + TERM + ")"
        + " (CT = " + CT + ")"
        + " (RV = " + RV + ")"
        + " (R1 = " + R1 + ")"
        + " (R2 = " + R2 + ")";
  }

}

Other Lucene examples (source code examples)

Here is a short list of links related to this Lucene BrazilianStemmer.java source code file:

... this post is sponsored by my books ...

#1 New Release!

FP Best Seller

 

new blog posts

 

Copyright 1998-2021 Alvin Alexander, alvinalexander.com
All Rights Reserved.

A percentage of advertising revenue from
pages under the /java/jwarehouse URI on this website is
paid back to open source projects.