How to use the libratom.lib.utils.cleanup_message_body function in libratom

To help you get started, we’ve selected a few libratom examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github libratom / libratom / tests / unit / test_cli.py View on Github external
)

    with db_session_from_cmd_out(result) as session:
        # Verify total message count
        assert session.query(Message).count() == 9297

        # Get message contents from DB
        msg = session.query(Message).filter_by(pff_identifier=msg_id).one()
        headers, body = msg.headers, msg.body

        if expected.with_messages:
            # Access message directly and compare
            archive_file = list(enron_dataset_part027.glob("*.pst"))[0]
            with open_mail_archive(archive_file) as archive:
                message = archive.get_message_by_id(msg_id)
                assert cleanup_message_body(*archive.get_message_body(message)) == body
                assert archive.get_message_headers(message) == headers

        else:
            assert headers is None
            assert body is None
github libratom / libratom / tests / unit / test_libratom.py View on Github external
def test_cleanup_message_body(body, body_type, result):
    """Check that cleaning ``body`` as ``body_type`` produces ``result``.

    The three arguments are presumably supplied by a pytest parametrization
    declared outside this excerpt -- confirm against the full test module.
    """
    cleaned = cleanup_message_body(body, body_type)
    assert cleaned == result
github libratom / libratom / tests / unit / test_cli.py View on Github external
# Run entity extraction job with message content flag on
    result = extract_entities(
        params, enron_dataset_part001, isolated_cli_runner, expected
    )

    # Get message contents from DB
    with db_session_from_cmd_out(result) as session:
        msg = session.query(Message).filter_by(pff_identifier=msg_id).one()
        headers, body = msg.headers, msg.body

    # Access message directly and compare
    archive_file = list(enron_dataset_part001.glob("*.pst"))[0]
    with open_mail_archive(archive_file) as archive:
        message = archive.get_message_by_id(msg_id)
        assert cleanup_message_body(*archive.get_message_body(message)) == body
        assert archive.get_message_headers(message) == headers
github libratom / libratom / libratom / lib / entities.py View on Github external
"""
    Job function for the worker processes
    """

    # Return basic types to avoid serialization issues
    res = {
        "filepath": filepath,
        "message_id": message_id,
        "date": date,
        "processing_start_time": datetime.utcnow(),
        "attachments": attachments,
    }

    try:
        # Extract entities from the message
        message_body = cleanup_message_body(
            body, body_type, RATOM_SPACY_MODEL_MAX_LENGTH
        )
        doc = spacy_model(message_body)
        res["entities"] = [(ent.text, ent.label_) for ent in doc.ents]

        res["processing_end_time"] = datetime.utcnow()

        if include_message_contents:
            res["body"] = message_body
            res["headers"] = headers

        return res, None

    except Exception as exc:
        return res, str(exc)
github libratom / libratom / libratom / lib / report.py View on Github external
try:

        for msg_info in get_messages(
            files,
            progress_callback=update_progress,
            with_content=include_message_contents,
            with_headers=include_message_contents,
        ):

            # Extract results
            message_id = msg_info.pop("message_id")
            filepath = msg_info.pop("filepath")
            attachments = msg_info.pop("attachments")

            if include_message_contents:
                msg_info["body"] = cleanup_message_body(
                    msg_info["body"], msg_info.pop("body_type")
                )

            # Create new message instance
            message = Message(pff_identifier=message_id, **msg_info)

            # Link message to a file_report
            try:
                file_report = session.query(FileReport).filter_by(path=filepath).one()
            except Exception as exc:
                file_report = None
                logger.info(
                    f"Unable to link message id {message_id} to a file. Error: {exc}"
                )

            message.file_report = file_report