How to use the pdfplumber.utils.cluster_objects function in pdfplumber

To help you get started, we’ve selected a few pdfplumber examples, based on popular ways it is used in public projects.

Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.

Source: GitHub — jsvine/pdfplumber, file pdfplumber/table.py (view on GitHub, external link)
def snap_edges(edges, tolerance=DEFAULT_SNAP_TOLERANCE):
    """
    Snap edges lying within `tolerance` pixels of one another to their positional average.

    Edges are split by their "orientation" key: "v" edges are clustered on
    "x0" and "h" edges on "top". Each cluster is handed to `move_to_avg`,
    and the results are flattened into one list, vertical edges first.
    Edges with any other orientation value are dropped.
    """
    vertical = [edge for edge in edges if edge["orientation"] == "v"]
    horizontal = [edge for edge in edges if edge["orientation"] == "h"]

    snapped = []

    # Vertical clusters are processed before horizontal ones so the output
    # order is all "v" results followed by all "h" results.
    for cluster in utils.cluster_objects(vertical, "x0", tolerance):
        snapped.extend(move_to_avg(cluster, "v"))

    for cluster in utils.cluster_objects(horizontal, "top", tolerance):
        snapped.extend(move_to_avg(cluster, "h"))

    return snapped
Source: GitHub — jsvine/pdfplumber, file pdfplumber/table.py (view on GitHub, external link)
def words_to_edges_v(words,
    word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words.
    """
    # Find words that share the same left, right, or centerpoints
    by_x0 = utils.cluster_objects(words, "x0", 1)
    by_x1 = utils.cluster_objects(words, "x1", 1)
    by_center = utils.cluster_objects(words, lambda x: (x["x0"] + x["x1"])/2, 1)
    clusters = by_x0 + by_x1 + by_center
    
    # Find the points that align with the most words
    sorted_clusters = sorted(clusters, key=lambda x: -len(x))
    large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)
    
    # For each of those points, find the rectangles fitting all matching words
    rects = list(map(utils.objects_to_rect, large_clusters))
    
    # Iterate through those rectangles, condensing overlapping rectangles
    condensed_rects = []
    for rect in rects:
        overlap = False
        for c in condensed_rects:
Source: GitHub — jsvine/pdfplumber, file pdfplumber/table.py (view on GitHub, external link)
def snap_edges(edges, tolerance=DEFAULT_SNAP_TOLERANCE):
    """
    Return a flat list of edges in which any edges within `tolerance` pixels
    of one another have been moved to their positional average.

    Vertical ("v") edges are grouped by their "x0" value and horizontal ("h")
    edges by their "top" value; every group is passed through `move_to_avg`.
    The vertical results come first in the returned list.
    """
    by_orientation = []
    for orientation in ("v", "h"):
        by_orientation.append(
            [edge for edge in edges if edge["orientation"] == orientation]
        )
    vertical_edges, horizontal_edges = by_orientation

    # Cluster each orientation along the axis on which its edges can drift.
    vertical_groups = []
    for cluster in utils.cluster_objects(vertical_edges, "x0", tolerance):
        vertical_groups.append(move_to_avg(cluster, "v"))

    horizontal_groups = []
    for cluster in utils.cluster_objects(horizontal_edges, "top", tolerance):
        horizontal_groups.append(move_to_avg(cluster, "h"))

    return list(itertools.chain.from_iterable(vertical_groups + horizontal_groups))
Source: GitHub — jsvine/pdfplumber, file pdfplumber/table.py (view on GitHub, external link)
def words_to_edges_v(words,
    word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words.
    """
    # Find words that share the same left, right, or centerpoints
    by_x0 = utils.cluster_objects(words, "x0", 1)
    by_x1 = utils.cluster_objects(words, "x1", 1)
    by_center = utils.cluster_objects(words, lambda x: (x["x0"] + x["x1"])/2, 1)
    clusters = by_x0 + by_x1 + by_center
    
    # Find the points that align with the most words
    sorted_clusters = sorted(clusters, key=lambda x: -len(x))
    large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)
    
    # For each of those points, find the rectangles fitting all matching words
    rects = list(map(utils.objects_to_rect, large_clusters))
    
    # Iterate through those rectangles, condensing overlapping rectangles
    condensed_rects = []
    for rect in rects:
        overlap = False
        for c in condensed_rects:
            if utils.objects_overlap(rect, c):
                overlap = True
Source: GitHub — jsvine/pdfplumber, file pdfplumber/table.py (view on GitHub, external link)
def words_to_edges_v(words,
    word_threshold=DEFAULT_MIN_WORDS_VERTICAL):
    """
    Find (imaginary) vertical lines that connect the left, right, or center of at least `word_threshold` words.
    """
    # Find words that share the same left, right, or centerpoints
    by_x0 = utils.cluster_objects(words, "x0", 1)
    by_x1 = utils.cluster_objects(words, "x1", 1)
    by_center = utils.cluster_objects(words, lambda x: (x["x0"] + x["x1"])/2, 1)
    clusters = by_x0 + by_x1 + by_center
    
    # Find the points that align with the most words
    sorted_clusters = sorted(clusters, key=lambda x: -len(x))
    large_clusters = filter(lambda x: len(x) >= word_threshold, sorted_clusters)
    
    # For each of those points, find the rectangles fitting all matching words
    rects = list(map(utils.objects_to_rect, large_clusters))
    
    # Iterate through those rectangles, condensing overlapping rectangles
    condensed_rects = []
    for rect in rects:
        overlap = False
        for c in condensed_rects:
            if utils.objects_overlap(rect, c):