How to use the pyserini.pyclass.autoclass function in pyserini

To help you get started, we’ve selected a few pyserini examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github castorini / pyserini / pyserini / search / _base.py View on Github external
#

"""
This module provides Pyserini's Python search interface to Anserini. The main entry point is the ``SimpleSearcher``
class, which wraps the Java class with the same name in Anserini.
"""

import logging

from ..pyclass import autoclass, JPaths

logger = logging.getLogger(__name__)


# Wrappers around Lucene classes.
# Each name is bound once, when this module is imported, to whatever ``autoclass``
# returns for the fully-qualified Java class name passed to it.
JQuery = autoclass('org.apache.lucene.search.Query')
JDocument = autoclass('org.apache.lucene.document.Document')

# Wrappers around Anserini classes (same mechanism, ``io.anserini.search`` namespace).
JTopicReader = autoclass('io.anserini.search.topicreader.TopicReader')
JTopics = autoclass('io.anserini.search.topicreader.Topics')
JQueryGenerator = autoclass('io.anserini.search.query.QueryGenerator')
JBagOfWordsQueryGenerator = autoclass('io.anserini.search.query.BagOfWordsQueryGenerator')
JCovid19QueryGenerator = autoclass('io.anserini.search.query.Covid19QueryGenerator')


class Document:
    """Wrapper class for a Lucene ``Document``.

    Parameters
    ----------
    document : JDocument
github castorini / pyserini / pyserini / index / _base.py View on Github external
def JArgs():
        """Create an ``io.anserini.index.IndexArgs`` object with preset flags.

        Returns
        -------
        object
            A freshly constructed ``IndexArgs`` whose ``storeContents``,
            ``storeRaw`` and ``dryRun`` fields are all set to ``True``.
        """
        index_args_cls = autoclass('io.anserini.index.IndexArgs')
        index_args = index_args_cls()
        # Same assignment order as before; ``dryRun`` is enabled so that
        # indexing will be skipped.
        for flag_name in ('storeContents', 'storeRaw', 'dryRun'):
            setattr(index_args, flag_name, True)
        return index_args
github castorini / pyserini / pyserini / search / _searcher.py View on Github external
def bm25(k1=0.9, b=0.4):
        """Construct a Lucene ``BM25Similarity`` from the given parameters.

        Parameters
        ----------
        k1 : float
            First constructor argument (defaults to 0.9).
        b : float
            Second constructor argument (defaults to 0.4).

        Returns
        -------
        object
            The result of calling the wrapped ``BM25Similarity`` class with ``(k1, b)``.
        """
        similarity_cls = autoclass('org.apache.lucene.search.similarities.BM25Similarity')
        return similarity_cls(k1, b)
github castorini / pyserini / pyserini / analysis / _base.py View on Github external
from ..pyclass import autoclass, JString

# Wrappers around Lucene classes.
# Each name is bound at import time to the result of ``autoclass`` for the
# fully-qualified Java class name given as its argument.
JAnalyzer = autoclass('org.apache.lucene.analysis.Analyzer')
JArabicAnalyzer = autoclass('org.apache.lucene.analysis.ar.ArabicAnalyzer')
JBengaliAnalyzer = autoclass('org.apache.lucene.analysis.bn.BengaliAnalyzer')
JCJKAnalyzer = autoclass('org.apache.lucene.analysis.cjk.CJKAnalyzer')
JGermanAnalyzer = autoclass('org.apache.lucene.analysis.de.GermanAnalyzer')
JSpanishAnalyzer = autoclass('org.apache.lucene.analysis.es.SpanishAnalyzer')
JFrenchAnalyzer = autoclass('org.apache.lucene.analysis.fr.FrenchAnalyzer')
JHindiAnalyzer = autoclass('org.apache.lucene.analysis.hi.HindiAnalyzer')
# NOTE: despite this section's heading, the next class is in the ``io.anserini`` namespace.
JDefaultEnglishAnalyzer = autoclass('io.anserini.analysis.DefaultEnglishAnalyzer')
JCharArraySet = autoclass('org.apache.lucene.analysis.CharArraySet')

# Wrappers around Anserini classes (``io.anserini.analysis`` namespace).
JAnalyzerUtils = autoclass('io.anserini.analysis.AnalyzerUtils')
JFreebaseAnalyzer = autoclass('io.anserini.analysis.FreebaseAnalyzer')
JTweetAnalyzer = autoclass('io.anserini.analysis.TweetAnalyzer')


def get_lucene_analyzer(name='english', stemming=True, stemmer='porter', stopwords=True) -> JAnalyzer:
    """Create a Lucene ``Analyzer`` with specific settings.

    Parameters
    ----------
    name : str
        Name of analyzer.
    stemming : bool
        Set to stem.
    stemmer : str
        Stemmer to use.
    stopwords : bool
github castorini / pyserini / pyserini / index / _base.py View on Github external
args = autoclass('io.anserini.index.IndexArgs')()
        args.storeContents = True
        args.storeRaw = True
        args.dryRun = True ## So that indexing will be skipped
        return args

    def JCounters():
        """Create an ``IndexCollection$Counters`` object.

        Returns
        -------
        object
            The result of calling the wrapped ``Counters`` class with the wrapped
            ``IndexCollection`` class itself (not an instance of it) as its argument.
        """
        collection_cls = autoclass('io.anserini.index.IndexCollection')
        # The ``$`` in the name presumably addresses a class nested inside IndexCollection.
        counters_cls = autoclass('io.anserini.index.IndexCollection$Counters')
        return counters_cls(collection_cls)


class JGenerators(Enum):
    """Enumeration of wrapped Anserini document-generator classes.

    Each member's value is what ``autoclass`` returns for a class in the
    ``io.anserini.index.generator`` namespace; the lookups run when this class
    body is executed.
    """

    DefaultLuceneDocumentGenerator = autoclass('io.anserini.index.generator.DefaultLuceneDocumentGenerator')
    TweetGenerator = autoclass('io.anserini.index.generator.TweetGenerator')
    # Member name is abbreviated; the underlying Java class is ``WashingtonPostGenerator``.
    WapoGenerator = autoclass('io.anserini.index.generator.WashingtonPostGenerator')


class Generator:
    """Wrapper class for Anserini's generators.

    Parameters
    ----------
    generator_class : str
        Name of generator class to instantiate
    """

    def __init__(self, generator_class):
        self.counters = JIndexHelpers.JCounters()
        self.args = JIndexHelpers.JArgs()
        self.generator_class = generator_class
        self.object = self._get_generator()
github castorini / pyserini / pyserini / collection / _base.py View on Github external
logger = logging.getLogger(__name__)


# Wrappers around Anserini collection classes, bound at import time via ``autoclass``.
JFileSegment = autoclass('io.anserini.collection.FileSegment')
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')


class JCollections(Enum):
    """Enumeration of wrapped Anserini collection classes.

    Each member is named after a class in the ``io.anserini.collection``
    namespace and its value is what ``autoclass`` returns for that class; the
    lookups run when this class body is executed.
    """

    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')


class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate
    collection_path : str
        Path to directory containing collection
github castorini / pyserini / pyserini / search / querybuilder.py View on Github external
"""
This module provides Pyserini's Python query-building interface for Anserini.
"""
import logging
from enum import Enum

from ..analysis import get_lucene_analyzer, Analyzer
from ..pyclass import autoclass

logger = logging.getLogger(__name__)


# Wrappers around Lucene classes, bound at import time via ``autoclass``.
JTerm = autoclass('org.apache.lucene.index.Term')
JBooleanClause = autoclass('org.apache.lucene.search.BooleanClause')
JBoostQuery = autoclass('org.apache.lucene.search.BoostQuery')
JTermQuery = autoclass('org.apache.lucene.search.TermQuery')

# Wrappers around Anserini classes
JQueryGeneratorUtils = autoclass('io.anserini.search.query.QueryGeneratorUtils')


class JBooleanClauseOccur(Enum):
    """Enumeration of boolean-clause occurrence values.

    Each member's value is obtained by calling the matching
    ``getBooleanClause*`` method on ``JQueryGeneratorUtils`` when this class
    body is executed.
    """

    should = JQueryGeneratorUtils.getBooleanClauseShould()
    must = JQueryGeneratorUtils.getBooleanClauseMust()
    must_not = JQueryGeneratorUtils.getBooleanClauseMustNot()
    # Shares its name with the ``filter`` builtin, but only inside this class namespace.
    filter = JQueryGeneratorUtils.getBooleanClauseFilter()


def get_boolean_query_builder():
    """Get a BooleanQueryBuilder object.
github castorini / pyserini / pyserini / search / _searcher.py View on Github external
def qld(mu=1000):
        """Construct a Lucene ``LMDirichletSimilarity`` from the given parameter.

        Parameters
        ----------
        mu : int
            Sole constructor argument (defaults to 1000).

        Returns
        -------
        object
            The result of calling the wrapped ``LMDirichletSimilarity`` class with ``mu``.
        """
        similarity_cls = autoclass('org.apache.lucene.search.similarities.LMDirichletSimilarity')
        return similarity_cls(mu)
github castorini / pyserini / pyserini / collection / _base.py View on Github external
# Wrapper around Anserini's ``SourceDocument`` class, bound at import time via ``autoclass``.
JSourceDocument = autoclass('io.anserini.collection.SourceDocument')


class JCollections(Enum):
    """Enumeration of wrapped Anserini collection classes.

    Each member is named after a class in the ``io.anserini.collection``
    namespace and its value is what ``autoclass`` returns for that class; the
    lookups run when this class body is executed.
    """

    CarCollection = autoclass('io.anserini.collection.CarCollection')
    Cord19AbstractCollection = autoclass('io.anserini.collection.Cord19AbstractCollection')
    ClueWeb09Collection = autoclass('io.anserini.collection.ClueWeb09Collection')
    ClueWeb12Collection = autoclass('io.anserini.collection.ClueWeb12Collection')
    HtmlCollection = autoclass('io.anserini.collection.HtmlCollection')
    JsonCollection = autoclass('io.anserini.collection.JsonCollection')
    NewYorkTimesCollection = autoclass('io.anserini.collection.NewYorkTimesCollection')
    TrecCollection = autoclass('io.anserini.collection.TrecCollection')
    TrecwebCollection = autoclass('io.anserini.collection.TrecwebCollection')
    TweetCollection = autoclass('io.anserini.collection.TweetCollection')
    WashingtonPostCollection = autoclass('io.anserini.collection.WashingtonPostCollection')
    WikipediaCollection = autoclass('io.anserini.collection.WikipediaCollection')


class Collection:
    """
    Iterable wrapper class for Anserini's DocumentCollection.

    Parameters
    ----------
    collection_class : str
        Name of collection class to instantiate
    collection_path : str
        Path to directory containing collection
    """

    def __init__(self, collection_class, collection_path):
        self.counters = Counters()
github castorini / pyserini / pyserini / index / _base.py View on Github external
and methods provided are meant only to provide tools for examining an index and are not optimized for computing over.
"""

import logging
from enum import Enum
from typing import Dict, Iterator, List, Optional, Tuple

from ..analysis import get_lucene_analyzer, JAnalyzer, JAnalyzerUtils
from ..pyclass import autoclass, JString
from ..search import Document

logger = logging.getLogger(__name__)


# Wrappers around Anserini classes
# NOTE: the Python name is ``JIndexReader`` but the wrapped Java class is ``IndexReaderUtils``.
JIndexReader = autoclass('io.anserini.index.IndexReaderUtils')


class JIndexHelpers:
    """Namespace of factory helpers for Anserini indexing objects.

    The helpers use no instance state, so they are declared as static methods:
    they can be called on the class (``JIndexHelpers.JArgs()``) and, unlike
    plain functions defined without ``self``, also on an instance.
    """

    @staticmethod
    def JArgs():
        """Create an ``io.anserini.index.IndexArgs`` object with preset flags.

        Returns
        -------
        object
            A freshly constructed ``IndexArgs`` whose ``storeContents``,
            ``storeRaw`` and ``dryRun`` fields are all set to ``True``.
        """
        args = autoclass('io.anserini.index.IndexArgs')()
        args.storeContents = True
        args.storeRaw = True
        args.dryRun = True  # So that indexing will be skipped
        return args

    @staticmethod
    def JCounters():
        """Create an ``IndexCollection$Counters`` object.

        Returns
        -------
        object
            The result of calling the wrapped ``Counters`` class with the wrapped
            ``IndexCollection`` class itself (not an instance of it) as its argument.
        """
        IndexCollection = autoclass('io.anserini.index.IndexCollection')
        # The ``$`` in the name presumably addresses a class nested inside IndexCollection.
        Counters = autoclass('io.anserini.index.IndexCollection$Counters')
        return Counters(IndexCollection)