How to use the pyserini.pyclass.JString function in pyserini

To help you get started, we’ve selected a few pyserini examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github castorini / pyserini / pyserini / search / _searcher.py View on Github external
List of search results.
        """
        hits = None
        if query_generator:
            hits = self.object.search(query_generator, JString(q), k)
        elif isinstance(q, JQuery):
            # Note that RM3 requires the notion of a query (string) to estimate the appropriate models. If we're just
            # given a Lucene query, it's unclear what the "query" is for this estimation. One possibility is to extract
            # all the query terms from the Lucene query, although this might yield unexpected behavior from the user's
            # perspective. Until we think through what exactly is the "right thing to do", we'll raise an exception
            # here explicitly.
            if self.is_using_rm3():
                raise NotImplementedError('RM3 incompatible with search using a Lucene query.')
            hits = self.object.search(q, k)
        else:
            hits = self.object.search(JString(q.encode('utf8')), k)

        docids = set()
        filtered_hits = []

        for hit in hits:
            if strip_segment_id is True:
                hit.docid = hit.docid.split('.')[0]

            if hit.docid in docids:
                continue

            filtered_hits.append(hit)

            if remove_dups is True:
                docids.add(hit.docid)
github castorini / pyserini / pyserini / index / _base.py View on Github external
term : str
            Term.
        analyzer : analyzer
            Lucene analyzer to use, ``None`` if term is already analyzed.
        k1 : float
            BM25 k1 parameter.
        b : float
            BM25 b parameter.

        Returns
        -------
        float
            BM25 weight of the term in the document, or 0 if the term does not exist in the document.
        """
        if analyzer is None:
            return self.object.getBM25AnalyzedTermWeightWithParameters(self.reader, JString(docid),
                                                                       JString(term.encode('utf-8')),
                                                                       float(k1), float(b))
        else:
            return self.object.getBM25UnanalyzedTermWeightWithParameters(self.reader, JString(docid),
                                                                         JString(term.encode('utf-8')), analyzer,
                                                                         float(k1), float(b))
github castorini / pyserini / pyserini / index / _base.py View on Github external
"""Return the :class:`Document` based on a ``field`` with ``id``. For example, this method can be used to fetch
        document based on alternative primary keys that have been indexed, such as an article's DOI.

        Parameters
        ----------
        field : str
            The field to look up.
        q : str
            The document's unique id.

        Returns
        -------
        Optional[Document]
            :class:`Document` whose ``field`` is ``id``.
        """
        lucene_document = self.object.documentByField(self.reader, JString(field), JString(q))
        if lucene_document is None:
            return None
        return Document(lucene_document)
github castorini / pyserini / pyserini / search / _searcher.py View on Github external
----------
        q : str
            Query string.
        f : str
            Additional field to search.
        boost : float
            Weight boost for additional field.
        k : int
            Number of hits to return.

        Returns
        -------
        List[JSimpleSearcherResult]
            List of document hits returned from search
        """
        return self.object.searchFields(JString(q), JString(f), float(boost), k)
github castorini / pyserini / pyserini / index / _base.py View on Github external
Parameters
        ----------
        term : str
            Raw term.
        analyzer : analyzer
            Analyzer to apply. Defaults to Anserini's default.

        Returns
        -------
        List[Posting]
            List of :class:`Posting` objects corresponding to the postings list for the term.
        """
        if analyzer is None:
            postings_list = self.object.getPostingsListForAnalyzedTerm(self.reader, JString(term.encode('utf-8')))
        else:
            postings_list = self.object.getPostingsListWithAnalyzer(self.reader, JString(term.encode('utf-8')),
                                                                    analyzer)

        if postings_list is None:
            return None

        result = []
        for posting in postings_list.toArray():
            result.append(Posting(posting.getDocid(), posting.getTF(), posting.getPositions()))
        return result
github castorini / pyserini / pyserini / index / _base.py View on Github external
k1 : float
            BM25 k1 parameter.
        b : float
            BM25 b parameter.

        Returns
        -------
        float
            BM25 weight of the term in the document, or 0 if the term does not exist in the document.
        """
        if analyzer is None:
            return self.object.getBM25AnalyzedTermWeightWithParameters(self.reader, JString(docid),
                                                                       JString(term.encode('utf-8')),
                                                                       float(k1), float(b))
        else:
            return self.object.getBM25UnanalyzedTermWeightWithParameters(self.reader, JString(docid),
                                                                         JString(term.encode('utf-8')), analyzer,
                                                                         float(k1), float(b))
github castorini / pyserini / pyserini / index / _base.py View on Github external
Parameters
        ----------
        term : str
            Unanalyzed term.
        analyzer : analyzer
            Analyzer to apply.

        Returns
        -------
        Tuple[int, int]
            Document frequency and collection frequency.
        """
        if analyzer is None:
            analyzer = get_lucene_analyzer(stemming=False, stopwords=False)

        term_map = self.object.getTermCountsWithAnalyzer(self.reader, JString(term.encode('utf-8')), analyzer)

        return term_map.get(JString('docFreq')), term_map.get(JString('collectionFreq'))
github castorini / pyserini / pyserini / analysis / _base.py View on Github external
def analyze(self, text: str) -> List[str]:
        """Tokenize ``text`` with this object's analyzer.

        Parameters
        ----------
        text : str
            Input text to run through the analyzer.

        Returns
        -------
        List[str]
            Tokens produced by ``JAnalyzerUtils.analyze``, in the order its result array lists them.
        """
        # The text is handed over as a JString built from its UTF-8 encoded bytes.
        analyzed = JAnalyzerUtils.analyze(
            self.analyzer,
            JString(text.encode('utf-8')),
        )
        # Copy each element of the result array into a fresh Python list.
        return [token for token in analyzed.toArray()]