How to use the pyserini.analysis.get_lucene_analyzer function in pyserini

To help you get started, we’ve selected a few pyserini examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github castorini / pyserini / tests / test_analysis.py View on Github external
def test_analysis(self):
        """Check the tokens produced for each ``get_lucene_analyzer`` configuration.

        The same sentence is analyzed with the default settings, with each
        stemmer named explicitly, with stemming disabled, and with both
        stemming and the stopword filter disabled.
        """
        sentence = 'City buses are running on time.'

        # Default is Porter stemmer
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer())
        # assertIsInstance reports the actual type on failure, unlike
        # assertTrue(isinstance(...)), which only says "False is not true".
        self.assertIsInstance(analyzer, Analyzer)
        tokens = analyzer.analyze(sentence)
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Porter stemmer explicitly
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemmer='porter'))
        self.assertIsInstance(analyzer, Analyzer)
        tokens = analyzer.analyze(sentence)
        self.assertEqual(tokens, ['citi', 'buse', 'run', 'time'])

        # Specify Krovetz stemmer explicitly
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemmer='krovetz'))
        self.assertIsInstance(analyzer, Analyzer)
        tokens = analyzer.analyze(sentence)
        self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

        # No stemming
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False))
        self.assertIsInstance(analyzer, Analyzer)
        tokens = analyzer.analyze(sentence)
        self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

        # No stopword filter, no stemming
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False, stopwords=False))
        self.assertIsInstance(analyzer, Analyzer)
        tokens = analyzer.analyze(sentence)
        self.assertEqual(tokens, ['city', 'buses', 'are', 'running', 'on', 'time'])
github castorini / pyserini / tests / test_index_reader.py View on Github external
def test_analyze(self):
        """Exercise ``index_reader.analyze`` with its default analyzer and with a non-stemming one."""
        reader = self.index_reader

        # Default analyzer: the expected tokens are stemmed forms.
        single = reader.analyze('retrieval')
        self.assertEqual(' '.join(single), 'retriev')
        phrase = reader.analyze('rapid retrieval, space economy')
        self.assertEqual(' '.join(phrase), 'rapid retriev space economi')

        # Analyzer built with stemming switched off: tokens keep their surface form.
        tokenizer = analysis.get_lucene_analyzer(stemming=False)
        single = reader.analyze('retrieval', analyzer=tokenizer)
        self.assertEqual(' '.join(single), 'retrieval')
        phrase = reader.analyze('rapid retrieval, space economy', analyzer=tokenizer)
        self.assertEqual(' '.join(phrase), 'rapid retrieval space economy')

        # Test utf encoding:
        first_token = reader.analyze('zoölogy')[0]
        self.assertEqual(first_token, 'zoölog')
        first_token = reader.analyze('zoölogy', analyzer=tokenizer)[0]
        self.assertEqual(first_token, 'zoölogy')
github castorini / pyserini / tests / test_analysis.py View on Github external
def test_different_analyzers_are_different(self):
        """Assert that one query returns different hits with and without stemming."""
        query = 'information retrieval'

        # First pass: analyzer with stemming disabled.
        unstemmed = analysis.get_lucene_analyzer(stemming=False)
        self.searcher.set_analyzer(unstemmed)
        hits_first = self.searcher.search(query)

        # Second pass: analyzer built with default settings.
        default = analysis.get_lucene_analyzer()
        self.searcher.set_analyzer(default)
        hits_second = self.searcher.search(query)

        self.assertNotEqual(hits_first, hits_second)
github castorini / pyserini / tests / test_analysis.py View on Github external
def test_different_analyzers_are_different(self):
        """Assert that one query returns different hits with and without stemming."""
        query = 'information retrieval'

        # First pass: analyzer with stemming disabled.
        unstemmed = analysis.get_lucene_analyzer(stemming=False)
        self.searcher.set_analyzer(unstemmed)
        hits_first = self.searcher.search(query)

        # Second pass: analyzer built with default settings.
        default = analysis.get_lucene_analyzer()
        self.searcher.set_analyzer(default)
        hits_second = self.searcher.search(query)

        self.assertNotEqual(hits_first, hits_second)
github castorini / pyserini / tests / test_analysis.py View on Github external
self.assertEqual(tokens, ['city', 'bus', 'running', 'time'])

        # No stemming
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'buses', 'running', 'time'])

        # No stopword filter, no stemming
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=False, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['city', 'buses', 'are', 'running', 'on', 'time'])

        # No stopword filter, with stemming
        analyzer = analysis.Analyzer(analysis.get_lucene_analyzer(stemming=True, stopwords=False))
        self.assertTrue(isinstance(analyzer, Analyzer))
        tokens = analyzer.analyze('City buses are running on time.')
        self.assertEqual(tokens, ['citi', 'buse', 'ar', 'run', 'on', 'time'])
github castorini / pyserini / pyserini / index / _base.py View on Github external
``Analyzer`` if analyzer is not specified.

        Parameters
        ----------
        term : str
            Unanalyzed term.
        analyzer : analyzer
            Analyzer to apply.

        Returns
        -------
        Tuple[int, int]
            Document frequency and collection frequency.
        """
        if analyzer is None:
            analyzer = get_lucene_analyzer(stemming=False, stopwords=False)

        term_map = self.object.getTermCountsWithAnalyzer(self.reader, JString(term.encode('utf-8')), analyzer)

        return term_map.get(JString('docFreq')), term_map.get(JString('collectionFreq'))
github castorini / pyserini / pyserini / index / _base.py View on Github external
    def compute_bm25_term_weight(self, docid: str, term: str, analyzer=get_lucene_analyzer(), k1=0.9, b=0.4) -> float:
        """Compute the BM25 weight of a term in a document. Specify ``analyzer=None`` for an already analyzed term,
        e.g., from the output of :func:`get_document_vector`.

        Parameters
        ----------
        docid : str
            Collection ``docid``.
        term : str
            Term.
        analyzer : analyzer
            Lucene analyzer to use, ``None`` if term is already analyzed.
        k1 : float
            BM25 k1 parameter.
        b : float
            BM25 b parameter.
github castorini / pyserini / pyserini / index / _base.py View on Github external
    def get_postings_list(self, term: str, analyzer=get_lucene_analyzer()) -> List[Posting]:
        """Return the postings list for a term.

        Parameters
        ----------
        term : str
            Raw term.
        analyzer : analyzer
            Analyzer to apply. Defaults to Anserini's default.

        Returns
        -------
        List[Posting]
            List of :class:`Posting` objects corresponding to the postings list for the term.
        """
        if analyzer is None:
            postings_list = self.object.getPostingsListForAnalyzedTerm(self.reader, JString(term.encode('utf-8')))
github castorini / pyserini / pyserini / search / querybuilder.py View on Github external
def get_term_query(term, field="contents", analyzer=get_lucene_analyzer()):
    """Searches the collection.

    Parameters
    ----------
    term : str
        The query term string.
    field : str
        Field to search.
    analyzer : Analyzer
        Analyzer to use for tokenizing the query term.

    Returns
    -------
    JTermQuery
    """
    analyzer = Analyzer(analyzer)