How to use the libratom.lib.concurrency.get_messages function in libratom

To help you get started, we’ve selected a few libratom examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

github libratom / libratom / tests / unit / test_libratom.py View on Github external
def test_attachments_mime_type_validation(enron_dataset, mock_progress_callback):
    """Check that every attachment's MIME type is in MIME_TYPES or is a tolerated obsolete type."""

    # Some enron files have these obsolete attachment types
    obsolete_mime_types = [
        "application/msexcell",
        "application/mspowerpoint",
    ]

    messages = get_messages(
        get_set_of_files(enron_dataset), progress_callback=mock_progress_callback
    )

    for message in messages:
        # A missing or empty "attachments" entry means there is nothing to check
        for attachment in message.get("attachments") or ():
            mime_type = attachment.mime_type
            assert mime_type in MIME_TYPES or mime_type in obsolete_mime_types
github libratom / libratom / tests / unit / test_libratom.py View on Github external
def test_get_messages_with_bad_files(enron_dataset_part044, mock_progress_callback):
    """Check that get_messages yields 558 truthy results for the dataset's *.pst files."""

    pst_files = enron_dataset_part044.glob("*.pst")
    messages = get_messages(
        files=pst_files,
        progress_callback=mock_progress_callback,
    )

    # Tally each result as it arrives; every one must be truthy
    total = 0
    for message in messages:
        total += 1
        assert message

    assert total == 558
github libratom / libratom / libratom / lib / entities.py View on Github external
# Load the file_report table for local lookup
    _file_reports = session.query(FileReport).all()  # noqa: F841

    # Start of multiprocessing
    with multiprocessing.Pool(processes=jobs, initializer=worker_init) as pool:

        logger.debug(f"Starting pool with {pool._processes} processes")

        new_entities = []
        msg_count = 0

        try:
            for msg_count, worker_output in enumerate(
                pool.imap_unordered(
                    process_message,
                    get_messages(
                        files,
                        spacy_model=spacy_model,
                        progress_callback=processing_update_progress,
                        include_message_contents=include_message_contents,
                        with_headers=include_message_contents,
                        **kwargs,
                    ),
                    chunksize=RATOM_MSG_BATCH_SIZE,
                ),
                start=1,
            ):

                # Unpack worker job output
                res, error = worker_output

                if error:
github libratom / libratom / libratom / lib / report.py View on Github external
Store full archive report in the DB
    """

    # Confirm environment settings
    for key, value in get_ratom_settings():
        logger.debug(f"{key}: {value}")

    # Default progress callback to no-op
    update_progress = progress_callback or (lambda *_, **__: None)

    # Load the file_report table for local lookup
    _file_reports = session.query(FileReport).all()  # noqa: F841

    try:

        for msg_info in get_messages(
            files,
            progress_callback=update_progress,
            with_content=include_message_contents,
            with_headers=include_message_contents,
        ):

            # Extract results
            message_id = msg_info.pop("message_id")
            filepath = msg_info.pop("filepath")
            attachments = msg_info.pop("attachments")

            if include_message_contents:
                msg_info["body"] = cleanup_message_body(
                    msg_info["body"], msg_info.pop("body_type")
                )