Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
# Initialised here; not modified anywhere in this span (presumably a progress
# counter updated further down -- confirm against the rest of the function).
done = 0

# Distinct links gathered from every `todo` entry whose "proper_links" value
# is missing or empty.
batch_urls = list({
    link
    for entry in todo
    if not entry.get("proper_links", [])
    for link in entry.get('links', [])
})

# Map each link already stored in `linkscoll` (looked up by "_id") to its
# stored "real" value.
# NOTE(review): looks like a MongoDB collection where "real" holds the
# resolved URL -- confirm against the code that writes it.
alreadydone = {
    doc["_id"]: doc["real"]
    for doc in linkscoll.find({"_id": {"$in": batch_urls}})
}

# Decide which of the batch links still have to be queued in `urls_to_clear`.
urls_to_clear = []
for u in batch_urls:
    if u in alreadydone:
        continue
    if u.startswith("https://twitter.com/") and "/status/" in u:
        # Twitter status links are recorded directly instead of being queued,
        # with any "?s=19" fragment stripped.
        alreadydone[u] = u.replace("?s=19", "")
    else:
        urls_to_clear.append(u)

links_to_save = []
t = datetime.now().isoformat()
print(" + [%s] %s urls to resolve" % (t, len(urls_to_clear)))
try:
for res in multithreaded_resolve(
urls_to_clear,
threads=min(50, batch_size),
throttle=0.2,
max_redirects=20,
insecure=True,
timeout=Timeout(connect=10, read=30),
follow_meta_refresh=True
):
source = res.url
last = res.stack[-1]
if res.error and type(res.error) != RedirectError and not issubclass(type(res.error), RedirectError):
print("ERROR on resolving %s: %s (last url: %s)" % (source, res.error, last.url), file=sys.stderr)
continue
if verbose:
print(" ", last.status, "(%s)" % last.type, ":", source, "->", last.url, file=sys.stderr)
if len(source) < 1024: