How to use the ural.normalize_url function in ural

To help you get started, we've selected a few examples of ural.normalize_url, based on popular ways it is used in public projects.

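Before diving into the project examples, here is a minimal call. By default, normalize_url strips the protocol, irrelevant subdomains such as www, index pages and common tracking parameters; the output shown below is indicative and assumes a recent ural version:

from ural import normalize_url

normalize_url('https://www2.lemonde.fr/index.php?utm_source=twitter')
# 'lemonde.fr'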

From medialab/gazouilloire on GitHub: bin/export_shared_domains.py
import json
from collections import defaultdict

import progressbar
from pymongo import MongoClient
from ural import normalize_url

with open('config.json') as confile:
    conf = json.loads(confile.read())

db = MongoClient(conf['mongo']['host'], conf['mongo']['port'])[conf['mongo']['db']]['tweets']

urls = defaultdict(int)
query = {}
# query["langs"] = "fr"

print("Counting matching results...")
count = db.count(query)  # use count_documents(query) on pymongo >= 4
print("Querying and hashing results...")
bar = progressbar.ProgressBar(max_value=count)
for t in bar(db.find(query, limit=count, projection={"links": 1, "proper_links": 1})):
    for l in t.get("proper_links", t["links"]):
        # Normalize only the domain part of each shared link
        d = normalize_url(l.split("/")[2])
        urls[d] += 1

print("Sorting and storing csv data...")
with open("shared_domains.csv", "w") as f:
    print("domain,shares", file=f)
    bar = progressbar.ProgressBar(max_value=len(urls))
    for link, shares in bar(sorted(urls.items(), key=lambda x: -x[1])):
        # format_csv is a helper defined elsewhere in the project
        print('%s,%s' % (format_csv(link), shares), file=f)
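Note that the script extracts the hostname by hand with l.split("/")[2] before normalizing it. Recent ural versions also ship dedicated helpers for this; a small sketch, assuming get_hostname and get_domain_name are available in your version:

from ural import get_hostname, get_domain_name

url = 'https://www.lemonde.fr/pixels/article.html'
get_hostname(url)     # 'www.lemonde.fr'
get_domain_name(url)  # 'lemonde.fr'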
From medialab/gazouilloire on GitHub: bin/export_users_urls_domains.py
with open("users_urls_domains.csv", "w") as f:
    print >> f, "user_screenname,user_id,url,normalized_url,domain,datetime,is_retweet,followers,has_media"
    bar = progressbar.ProgressBar(max_value=count)
    for t in bar(db.find(query, limit=count, projection={"user_screen_name": 1, "user_id_str": 1, "links": 1, "proper_links": 1, "retweet_id": 1, "created_at": 1, "user_followers": 1, "medias": 1})):
        links = t.get("proper_links", t["links"])
        if not links:
            continue
        name = t.get("user_screen_name")
        uid = t.get("user_id_str")
        isRT = 1 if t["retweet_id"] else 0
        fols = t["user_followers"]
        media = 1 if t["medias"] else 0
        dtime = isodate(t["created_at"])
        for l in links:
            try:
                lnk = normalize_url(l.encode("utf-8").replace("%0D", ""), strip_trailing_slash=True, strip_lang_subdomains=True)
            except Exception as e:
                print >> sys.stderr, "ERROR normalizing url", l, type(e), e
                lnk = l
            try:
                domain = normalize_url(l.split("/")[2])
            except Exception as e:
                print >> sys.stderr, "ERROR normalizing domain for url", l, type(e), e
                domain = ""
            print >> f, ",".join([format_csv(v) for v in [name, uid, l, lnk, domain, dtime, str(isRT), str(fols), str(media)]])
From medialab/ural on GitHub: scripts/analysis.py
from urllib.parse import urlsplit, parse_qsl
from collections import Counter
from tqdm import tqdm

from ural import normalize_url

TOP = 50

FRAGMENTS = Counter()
QUERIES = Counter()
QUERIES_COMBO = Counter()

with open('./scripts/data/urls.csv') as f:
    for line in tqdm(f, desc='Reading urls'):
        url = line.strip()[1:-1]  # drop the surrounding quotes
        url = normalize_url(url, strip_protocol=False)
        parsed = urlsplit(url)

        FRAGMENTS[parsed.fragment] += 1

        if parsed.query:
            for name, value in parse_qsl(parsed.query):
                QUERIES[name] += 1
                QUERIES_COMBO['%s=%s' % (name, value)] += 1

def report(name, counter):
    print()

    title = 'Top %i %s:' % (TOP, name)
    print(title)
    print('-' * len(title))

    # The excerpt is truncated here; a likely continuation prints
    # the counter's most frequent entries:
    for key, count in counter.most_common(TOP):
        print('%s\t%i' % (key, count))

report('fragments', FRAGMENTS)
report('queries', QUERIES)
report('query combos', QUERIES_COMBO)
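To see what the counting loop produces, here is a self-contained toy run of the same logic on two made-up urls (names and counts are illustrative):

from collections import Counter
from urllib.parse import urlsplit, parse_qsl

queries = Counter()
for url in ['http://site.fr/a?utm_source=tw&id=1', 'http://site.fr/b?id=2']:
    parsed = urlsplit(url)
    for name, value in parse_qsl(parsed.query):
        queries[name] += 1

print(queries.most_common(2))  # [('id', 2), ('utm_source', 1)]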
From medialab/ural on GitHub: ural/cli/normalize.py
def normalize_action(namespace):
    # Map the CLI flags to normalize_url keyword arguments
    sort_query = not namespace.no_query_sort
    strip_authentication = not namespace.keep_authentication
    strip_trailing_slash = namespace.strip_trailing_slash
    strip_index = not namespace.keep_index

    headers, position, reader = custom_reader(namespace.file, namespace.column)

    # Copy the input csv, appending a "<column>_normalized" field
    headers.append(namespace.column + "_normalized")
    writer = csv.writer(namespace.output)
    writer.writerow(headers)

    for line in reader:
        url = line[position]
        line.append(normalize_url(url, sort_query=sort_query, strip_authentication=strip_authentication,
                                  strip_trailing_slash=strip_trailing_slash, strip_index=strip_index))
        writer.writerow(line)
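If you do not want to go through the CLI plumbing, the same normalization pass can be written directly with the standard csv module; a standalone sketch where the file names and the 'url' column are placeholders:

import csv
from ural import normalize_url

with open('urls.csv') as infile, open('urls_normalized.csv', 'w', newline='') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)
    headers = next(reader)
    position = headers.index('url')  # placeholder column name
    writer.writerow(headers + ['url_normalized'])
    for row in reader:
        row.append(normalize_url(row[position], sort_query=True))
        writer.writerow(row)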