View Javadoc

1   /***
2    *     Aedict - an EDICT browser for Android
3   Copyright (C) 2009 Martin Vysny
4   
5   This program is free software: you can redistribute it and/or modify
6   it under the terms of the GNU General Public License as published by
7   the Free Software Foundation, either version 3 of the License, or
8   (at your option) any later version.
9   
10  This program is distributed in the hope that it will be useful,
11  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  GNU General Public License for more details.
14  
15  You should have received a copy of the GNU General Public License
16  along with this program.  If not, see <http://www.gnu.org/licenses/>.
17   */
18  package sk.baka.aedict.kanji;
19  
20  import java.util.ArrayList;
21  import java.util.HashSet;
22  import java.util.List;
23  import java.util.Set;
24  
25  import sk.baka.aedict.dict.DictTypeEnum;
26  import sk.baka.aedict.dict.MatcherEnum;
27  import sk.baka.aedict.dict.SearchQuery;
28  import sk.baka.aedict.kanji.Deinflections.Deinflection;
29  import sk.baka.aedict.kanji.VerbInflection.Form;
30  
31  /***
32   * Performs a simple verb deinflection.
33   * 
34   * @author Martin Vysny
35   */
36  public final class VerbDeinflection {
37  
38      private static class IrregularDeinflector extends EndsWithDeinflector {
39  
40          public IrregularDeinflector(final String inflected, final Form form, final String... base) {
41              super(inflected, true, true, form, base);
42          }
43      }
44  
45      private static class EndsWithDeinflector extends AbstractDeinflector {
46  
47          private final String endsWith;
48          private final String[] replaceBy;
49          private final boolean isAllowEntireWordMatch;
50          private final boolean isStopIfMatch;
51          private final Form form;
52  
53          /***
54           * Deinflects a verb if it ends with one of the following strings.
55           *
56           * @param endsWith
57           *            a non-empty non-null list of possible endings, lower-case
58           *            trimmed romaji.
59           *            @param form the form of the verb.
60           * @param replaceBy
61           *            the ending is replaced by this string.
62           */
63          public EndsWithDeinflector(final String endsWith, final Form form, final String... replaceBy) {
64              this(endsWith, false, false, form, replaceBy);
65          }
66  
67          /***
68           * Deinflects a verb if it ends with one of the following strings.
69           *
70           * @param endsWith
71           *            a non-empty non-null list of possible endings, lower-case
72           *            trimmed romaji.
73           * @param isAllowEntireWordMatch
74           *            if true then an entire word must match, if false then only
75           *            a suffix (not an entire word) must match. This is often
76           *            not wanted, e.g. the itai suffix would match the itai
77           *            word.
78           * @param isStopIfMatch
79           *            defines the return value of {@link #stopIfMatch()}.
80           *            @param form the form of the verb.
81           * @param replaceBy
82           *            the ending is replaced by this string.
83           */
84          public EndsWithDeinflector(final String endsWith, final boolean isAllowEntireWordMatch, final boolean isStopIfMatch, final Form form, final String... replaceBy) {
85              this.endsWith = endsWith;
86              this.replaceBy = replaceBy;
87              this.isAllowEntireWordMatch = isAllowEntireWordMatch;
88              this.isStopIfMatch = isStopIfMatch;
89              this.form = form;
90          }
91  
92          @Override
93          public Set<String> deinflect(String romaji) {
94              final String ending = isMatch(romaji);
95              if (ending == null) {
96                  // nothing matched
97                  return null;
98              }
99              final Set<String> result = new HashSet<String>(replaceBy.length);
100             final String verbPart = romaji.substring(0, romaji.length() - ending.length());
101             for (final String rb : replaceBy) {
102                 result.add(verbPart + rb);
103             }
104             return result;
105         }
106 
107         private String isMatch(final String romaji) {
108             final String ending = endsWith;
109                 if (isAllowEntireWordMatch && romaji.equals(ending)) {
110                     return ending;
111                 }
112                 if (!isAllowEntireWordMatch) {
113                     if (romaji.endsWith(ending) && !romaji.equals(ending)) {
114                         return ending;
115                     }
116                 }
117             return null;
118         }
119 
120         @Override
121         public boolean stopIfMatch() {
122             return isStopIfMatch;
123         }
124 
125 		@Override
126 		public Form getForm() {
127 			return form;
128 		}
129     }
130 
131     private static class EruDeinflector extends AbstractDeinflector {
132         // this rule is also required, to correctly deinflect e.g.
133         // aetai. list as a last rule. Make the rule produce the old verb and
134         // also the deinflected one.
135 
136         private final AbstractDeinflector eruDeinflector = new EndsWithDeinflector("eru", Form.ABLE_TO_DO2, "eru", "u");
137 
138         @Override
139         public Set<String> deinflect(String romaji) {
140             // do not deinflect -rareru
141             if (romaji.endsWith("rareru")) {
142                 return null;
143             }
144             return eruDeinflector.deinflect(romaji);
145         }
146 
147         @Override
148         public boolean stopIfMatch() {
149             // if the -eru is deinflected, there is nothing more to match
150             return true;
151         }
152 
153 		@Override
154 		public Form getForm() {
155 			return Form.ABLE_TO_DO2;
156 		}
157     }
158 
159     private static abstract class AbstractDeinflector {
160 
161         /***
162          * Tries to deinflect a verb.
163          *
164          * @param romaji
165          *            a verb in lower-case, trimmed romaji.
166          * @return deinflected verb, or a multiple verbs if there are multiple
167          *         possibilities to deinflect. If this rule cannot be applied to
168          *         deinflect the verb, null or an empty array should be
169          *         returned.
170          */
171         public abstract Set<String> deinflect(String romaji);
172 
173         /***
174          * If true then there is nothing more to deinflect and the process can
175          * be safely stopped.
176          *
177          * @return true if there is nothing more to deinflect, false if the
178          *         deinflection should continue.
179          */
180         public abstract boolean stopIfMatch();
181         
182         /***
183          * Returns the originating form.
184          * @return originating form.
185          */
186         public abstract Form getForm();
187     }
188 
189     private static AbstractDeinflector basicSuffix(final String endsWith, final Form form, final String... replaceBy) {
190         return new EndsWithDeinflector(endsWith, false, true, form, replaceBy);
191     }
192 
193     private static List<IrregularDeinflector> irregular(String[] endsWith, final Form form, final String replaceBy) {
194     	final List<IrregularDeinflector> result=new ArrayList<VerbDeinflection.IrregularDeinflector>();
195     	for(final String ew:endsWith){
196     		result.add(new IrregularDeinflector(ew, form, replaceBy));
197     	}
198     	return result;
199     }
200     
201     private final static List<? extends AbstractDeinflector> DEINFLECTORS;
202 
203     static {
204         final List<AbstractDeinflector> d = new ArrayList<AbstractDeinflector>();
205         d.addAll(irregular(new String[]{"dewaarimasen", "dehaarimasen", "de wa arimasen", "de ha arimasen", "zya arimasen", "zyaarimasen"}, Form.POLITE_NEGATIVE, "desu"));
206         d.addAll(irregular(new String[]{"dewaarimasendesita", "dehaarimasendesita", "de wa arimasen desita", "de ha arimasen desita", "zya arimasen desita", "zyaarimasendesita"}, Form.POLITE_PAST_NEGATIVE, "desu"));
207         // the -masu deinflector
208         d.add(new EndsWithDeinflector("masen", Form.POLITE_NEGATIVE, "masu"));
209         d.add(new EndsWithDeinflector("masita", Form.POLITE_PAST, "masu"));
210         d.add(new EndsWithDeinflector("masendesita", Form.POLITE_PAST_NEGATIVE, "masu"));
211         d.add(new EndsWithDeinflector("masen desita", Form.POLITE_PAST_NEGATIVE, "masu"));
212         // the -nakatta deinflector
213         d.add(new EndsWithDeinflector("nakatta", Form.NEGATIVE_PAST, "nai"));
214         // irregulars deinflector
215         d.add(new IrregularDeinflector("sinai", Form.NEGATIVE, "suru"));
216         d.add(new IrregularDeinflector("sita", Form.PAST_TENSE, "suru"));
217         d.add(new IrregularDeinflector("site", Form.CONTINUATION , "suru"));
218         d.add(new IrregularDeinflector("simasu", Form.POLITE, "suru"));
219         d.add(new IrregularDeinflector("siyou", Form.LET_S2, "suru"));
220         d.add(new IrregularDeinflector("sareru", Form.PLAIN, "sareru"));
221         d.add(new IrregularDeinflector("sarenai", Form.NEGATIVE, "sareru"));
222         d.add(new IrregularDeinflector("sareta", Form.PAST_TENSE, "sareru"));
223         d.add(new IrregularDeinflector("konai", Form.NEGATIVE, "kuru"));
224         d.add(new IrregularDeinflector("kita",Form.PAST_TENSE, "kuru"));
225         d.add(new IrregularDeinflector("kite", Form.CONTINUATION, "kuru"));
226         d.add(new IrregularDeinflector("kimasu", Form.POLITE, "kuru"));
227         d.add(new IrregularDeinflector("koyou",Form.LET_S2, "kuru"));
228         d.add(new IrregularDeinflector("da", Form.PLAIN, "desu"));
229         d.addAll(irregular(new String[]{"dewanai", "zyanai"},Form.NEGATIVE, "desu"));
230         d.add(new IrregularDeinflector("datta", Form.PAST_TENSE , "desu"));
231         d.add(new IrregularDeinflector("desita", Form.POLITE_PAST, "desu"));
232         d.add(new IrregularDeinflector("de", null, "desu"));
233         d.addAll(irregular(new String[]{"dehanai", "de ha nai", "dewanai", "de wa nai"}, Form.NEGATIVE, "desu"));
234         d.addAll(irregular(new String[]{"dehaaru", "de ha aru", "de wa aru", "dewaaru"}, Form.PLAIN, "desu"));
235         d.add(new IrregularDeinflector("itta", Form.PAST_TENSE, "iku"));
236         d.add(new IrregularDeinflector("itte", Form.CONTINUATION, "iku"));
237         d.add(new IrregularDeinflector("ikimasu", Form.POLITE, "iku"));
238         d.add(new IrregularDeinflector("ikareru", Form.PLAIN, "ikareru"));
239         d.add(new IrregularDeinflector("ikarenai", Form.NEGATIVE, "ikareru"));
240         d.add(new IrregularDeinflector("ikareta",Form.PAST_TENSE, "ikareru"));
241         d.add(new IrregularDeinflector("nai", Form.NEGATIVE, "aru"));
242         d.add(new IrregularDeinflector("nakatta", Form.NEGATIVE_PAST, "aru"));
243         d.add(new IrregularDeinflector("arimasu",Form.POLITE, "aru"));
244         d.add(new IrregularDeinflector("atte", Form.CONTINUATION, "aru", "au"));
245         d.add(new IrregularDeinflector("atta", Form.PAST_TENSE, "aru", "au"));
246         // regular inflections
247         d.add(basicSuffix("kereba", Form.NEGATIVE_CONDITIONAL, "i"));
248         d.add(basicSuffix("arenai", Form.NEGATIVE, "areru"));
249         d.add(basicSuffix("areta", Form.PAST_TENSE, "areru"));
250         d.add(basicSuffix("areru", Form.PLAIN, "areru"));
251         d.add(basicSuffix("wanai", Form.NEGATIVE, "u"));
252         d.add(basicSuffix("anai", Form.NEGATIVE, "u"));
253         // further deinflect -eru
254         d.add(new EndsWithDeinflector("enai", Form.NEGATIVE, "eru"));
255         // e.g. minai -> miru
256         d.add(basicSuffix("inai", Form.NEGATIVE, "iru"));
257         d.add(basicSuffix("itai", Form.WANT, "u"));
258         // further deinflect -eru
259         d.add(new EndsWithDeinflector("etai", Form.WANT, "eru"));
260         d.add(basicSuffix("eba", Form.IF2, "u"));
261         d.add(new EndsWithDeinflector("emasu", Form.ABLE_TO_DO2, "u", "eru"));
262         d.add(new EndsWithDeinflector("imasu", Form.POLITE, "u", "iru"));
263         d.add(basicSuffix("outosuru", null, "u"));
264         d.add(basicSuffix("ou to suru",null, "u"));
265         // this is dangerous - it will deinflect all ichidan verbs. however,
266         // this rule is also required, to correctly deinflect e.g.
267         // aetai. list as a last rule. Make the rule produce the old verb and
268         // also the deinflected one.
269         d.add(new EruDeinflector());
270         // and finally, the -ta and -te deinflectors
271         d.add(basicSuffix("sita", Form.PAST_TENSE, "su"));
272         d.add(basicSuffix("site",Form.CONTINUATION, "su"));
273         // -ite may be a godan -ku but also ichidan -iru verb
274         d.add(basicSuffix("ita", Form.PAST_TENSE, "ku", "iru"));
275         d.add(basicSuffix("ite",Form.CONTINUATION, "ku", "iru"));
276         // this is purely for ichidan -eru verb
277         d.add(basicSuffix("eta", Form.PAST_TENSE, "eru"));
278         d.add(basicSuffix("ete",Form.CONTINUATION, "eru"));
279         d.add(basicSuffix("ida", Form.PAST_TENSE, "gu"));
280         d.add(basicSuffix("ide",Form.CONTINUATION, "gu"));
281         d.add(basicSuffix("tta", Form.PAST_TENSE, "tu", "u", "ru"));
282         d.add(basicSuffix("tte",Form.CONTINUATION, "tu", "u", "ru"));
283         d.add(basicSuffix("nda", Form.PAST_TENSE, "nu", "bu", "mu"));
284         d.add(basicSuffix("nde",Form.CONTINUATION, "nu", "bu", "mu"));
285         DEINFLECTORS = d;
286     }
287 
288     /***
289      * Attempts to deinflect given verb.
290      *
291      * @param japanese
292      *            {@link RomanizationEnum#NihonShiki} romaji or hiragana.
293      * @return deinflected verb(s)
294      */
295     public static Deinflections deinflect(final String japanese) {
296         final Deinflections result = new Deinflections();
297         result.deinflections=new ArrayList<Deinflection>();
298         result.deinflectedVerbs = new HashSet<String>();
299         Set<String> finalDeinflect = new HashSet<String>();
300         result.deinflectedVerbs.add(RomanizationEnum.NihonShiki.toRomaji(japanese).trim());
301         for (final AbstractDeinflector deinflector : DEINFLECTORS) {
302             final Set<String> newResult = new HashSet<String>(result.deinflectedVerbs);
303             for (final String romaji : result.deinflectedVerbs) {
304                 final Set<String> deinflected = deinflector.deinflect(romaji);
305                 if (deinflected != null && !deinflected.isEmpty()) {
306                     // successfully deinflected. remove the old verb and add the
307                     // deinflected one.
308                     newResult.remove(romaji);
309                     if (deinflector.stopIfMatch()) {
310                         finalDeinflect.addAll(deinflected);
311                     } else {
312                         newResult.addAll(deinflected);
313                     }
314                     result.deinflections.add(new Deinflection(romaji, deinflector.getForm(), deinflected.toArray(new String[0])));
315                 }
316             }
317             result.deinflectedVerbs = newResult;
318         }
319         result.deinflectedVerbs.addAll(finalDeinflect);
320         return result;
321     }
322 
323     private VerbDeinflection() {
324         throw new AssertionError();
325     }
326 
327     /***
328      * Creates an EDICT query which searches for a japanese term. Automatically performs a verb deinflection.
329      *
330      * @param verb
331      *            the word to search, in japanese language, may contain romaji.
332      *            Full-width katakana conversion is performed automatically. Not
333      *            null
334      * @param romanization
335      *            the romanization system to use, not null.
336      * @return search query, never null
337      */
338     public static Deinflections searchJpDeinflected(final String verb, final RomanizationEnum romanization) {
339         final SearchQuery result = new SearchQuery(DictTypeEnum.Edict);
340         final String conv = KanjiUtils.halfwidthToKatakana(verb);
341         final String romaji = RomanizationEnum.NihonShiki.toRomaji(romanization.toHiragana(conv));
342         final Deinflections deinflections = VerbDeinflection.deinflect(romaji);
343         result.query = deinflections.deinflectedVerbs.toArray(new String[0]);
344         for (int i = 0; i < result.query.length; i++) {
345             result.query[i] = RomanizationEnum.NihonShiki.toHiragana(result.query[i]);
346         }
347         result.isJapanese = true;
348         result.matcher = MatcherEnum.Exact;
349         deinflections.query = result;
350         return deinflections;
351     }
352 }