Spaces:
Runtime error
Runtime error
#
# Pyserini: Reproducible IR research with sparse and dense representations
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import List | |
from ..pyclass import autoclass | |
# Wrappers around Lucene classes
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JDanishAnalyzer = autoclass('org.apache.lucene.analysis.da.DanishAnalyzer')
JDutchAnalyzer = autoclass('org.apache.lucene.analysis.nl.DutchAnalyzer')
JFinnishAnalyzer = autoclass('org.apache.lucene.analysis.fi.FinnishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
JHungarianAnalyzer = autoclass('org.apache.lucene.analysis.hu.HungarianAnalyzer')
JIndonesianAnalyzer = autoclass('org.apache.lucene.analysis.id.IndonesianAnalyzer')
JItalianAnalyzer = autoclass('org.apache.lucene.analysis.it.ItalianAnalyzer')
JJapaneseAnalyzer = autoclass('org.apache.lucene.analysis.ja.JapaneseAnalyzer')
JNorwegianAnalyzer = autoclass('org.apache.lucene.analysis.no.NorwegianAnalyzer')
JPortugueseAnalyzer = autoclass('org.apache.lucene.analysis.pt.PortugueseAnalyzer')
JRussianAnalyzer = autoclass('org.apache.lucene.analysis.ru.RussianAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JSwedishAnalyzer = autoclass('org.apache.lucene.analysis.sv.SwedishAnalyzer')
JTeluguAnalyzer = autoclass('org.apache.lucene.analysis.te.TeluguAnalyzer')
JThaiAnalyzer = autoclass('org.apache.lucene.analysis.th.ThaiAnalyzer')
JTurkishAnalyzer = autoclass('org.apache.lucene.analysis.tr.TurkishAnalyzer')
JWhiteSpaceAnalyzer = autoclass('org.apache.lucene.analysis.core.WhitespaceAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')

# Wrappers around Anserini classes
# (JDefaultEnglishAnalyzer lives in io.anserini, so it is bound here only,
# not in the Lucene group above.)
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')
JHuggingFaceTokenizerAnalyzer = autoclass('io.anserini.analysis.HuggingFaceTokenizerAnalyzer')
# Analyzers that are built with a no-argument constructor, keyed by the
# lower-cased ``language`` value accepted by ``get_lucene_analyzer``.
_SIMPLE_ANALYZERS = {
    'ar': JArabicAnalyzer,
    'bn': JBengaliAnalyzer,
    'zh': JCJKAnalyzer,
    'ko': JCJKAnalyzer,
    'da': JDanishAnalyzer,
    'nl': JDutchAnalyzer,
    'fi': JFinnishAnalyzer,
    'fr': JFrenchAnalyzer,
    'de': JGermanAnalyzer,
    'hi': JHindiAnalyzer,
    'hu': JHungarianAnalyzer,
    'id': JIndonesianAnalyzer,
    'it': JItalianAnalyzer,
    'ja': JJapaneseAnalyzer,
    'no': JNorwegianAnalyzer,
    'pt': JPortugueseAnalyzer,
    'ru': JRussianAnalyzer,
    'es': JSpanishAnalyzer,
    'sv': JSwedishAnalyzer,
    'te': JTeluguAnalyzer,
    'th': JThaiAnalyzer,
    'tr': JTurkishAnalyzer,
    'tweet': JTweetAnalyzer,
}


def get_lucene_analyzer(language: str='en', stemming: bool=True, stemmer: str='porter', stopwords: bool=True, huggingFaceTokenizer: str=None) -> JAnalyzer:
    """Create a Lucene ``Analyzer`` with specific settings.

    Parameters
    ----------
    language : str
        Name of analyzer, matched case-insensitively: a language code such
        as ``'en'`` or ``'fr'``, or one of the special names ``'tweet'`` and
        ``'hgf_tokenizer'``.
    stemming : bool
        Set to stem. Only consulted when ``language`` is ``'en'``.
    stemmer : str
        Stemmer to use. Only consulted when ``language`` is ``'en'`` and
        ``stemming`` is true.
    stopwords : bool
        Set to filter stopwords. Only consulted when ``language`` is ``'en'``.
    huggingFaceTokenizer: str
        a huggingface model id or path to a tokenizer.json file. Only
        consulted when ``language`` is ``'hgf_tokenizer'``.

    Returns
    -------
    JAnalyzer
        Java ``Analyzer`` with specified settings.

    Raises
    ------
    ValueError
        If ``language`` does not name a supported analyzer.
    """
    # Normalise once instead of re-lowercasing for every comparison.
    name = language.lower()

    analyzer_class = _SIMPLE_ANALYZERS.get(name)
    if analyzer_class is not None:
        return analyzer_class()

    if name == 'hgf_tokenizer':
        return JHuggingFaceTokenizerAnalyzer(huggingFaceTokenizer)

    if name == 'en':
        # Passing an empty stopword set switches stopword filtering off;
        # leaving the argument out keeps the analyzer's own stopword handling.
        if stemming:
            if stopwords:
                return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer)
            return JDefaultEnglishAnalyzer.newStemmingInstance(stemmer, JCharArraySet.EMPTY_SET)
        if stopwords:
            return JDefaultEnglishAnalyzer.newNonStemmingInstance()
        return JDefaultEnglishAnalyzer.newNonStemmingInstance(JCharArraySet.EMPTY_SET)

    raise ValueError('Invalid configuration.')
class Analyzer:
    """Thin Python facade over a Lucene ``Analyzer`` for tokenizing text.

    Parameters
    ----------
    analyzer : JAnalyzer
        Lucene ``Analyzer`` instance that performs the analysis.

    Raises
    ------
    TypeError
        If ``analyzer`` is not an instance of ``JAnalyzer``.
    """

    def __init__(self, analyzer):
        if isinstance(analyzer, JAnalyzer):
            self.analyzer = analyzer
        else:
            raise TypeError('Invalid JAnalyzer!')

    def analyze(self, text: str) -> List[str]:
        """Run the wrapped analyzer over ``text`` and collect its tokens.

        Parameters
        ----------
        text : str
            Text to analyze.

        Returns
        -------
        List[str]
            Tokens produced by the analyzer, in the order they were emitted.
        """
        java_tokens = JAnalyzerUtils.analyze(self.analyzer, text)
        # Copy the Java-side array into a plain Python list.
        return list(java_tokens.toArray())