import argparse
import logging
import sys


def get_args():
    """Parse command-line arguments."""
    parser = argparse.ArgumentParser(
        description='urlextract - print out all URLs found in the input')
    # options that populate args.unique and args.ignore_file are defined here

    parser.add_argument(
        'input_file', nargs='?', metavar='<input_file>',
        type=argparse.FileType(), default=sys.stdin,
        help='input text file with URLs to extract')

    parsed_args = parser.parse_args()
    return parsed_args


args = get_args()
logging.basicConfig(
    level=logging.INFO, stream=sys.stderr,
    format='%(asctime)s - %(levelname)s (%(name)s): %(message)s')
logger = logging.getLogger('urlextract')

try:
    urlextract = URLExtract()
    if args.ignore_file:
        # skip URLs listed in the ignore file
        urlextract.load_ignore_list(args.ignore_file)
    # refresh the cached TLD list when it is older than 30 days
    urlextract.update_when_older(30)
    content = args.input_file.read()
    for url in urlextract.find_urls(content, args.unique):
        print(url)
except CacheFileError as e:
    logger.error(str(e))
    sys.exit(-1)
finally:
    args.input_file.close()
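
The command-line snippet above is a thin wrapper around the same calls that are available from Python. Below is a minimal programmatic sketch, assuming the urlextract package is installed; URLExtract and find_urls are taken from the snippet above, while the sample text is made up.

from urlextract import URLExtract

extractor = URLExtract()
# find_urls() scans the text and returns the URLs it recognizes
for url in extractor.find_urls("Read the docs at https://example.com and http://example.org"):
    print(url)
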
def __init__(self, extract_email=False, **kwargs):
    """
    Initialize the URLExtract class.

    Tries to get cached TLDs; if the cache file does not exist, it will
    try to download a new list from IANA and save it to the cache file.

    :param bool extract_email: True if we want to extract emails from text.
        Disabled by default.
    """
    super(URLExtract, self).__init__(**kwargs)

    self._tlds_re = None
    self._reload_tlds_from_file()

    self._extract_email = extract_email

    # general stop characters
    general_stop_chars = {'\"', '<', '>', ';'}

    # defining default stop chars left
    self._stop_chars_left = set(string.whitespace)
    self._stop_chars_left |= general_stop_chars | {'|', '=', ']', ')', '}'}

    # defining default stop chars right
    self._stop_chars_right = set(string.whitespace)
    self._stop_chars_right |= general_stop_chars

    # preprocessed union _stop_chars is used in _validate_tld_match