Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
url = link.url.split('#', 1)[0]
# Check for VCS schemes that do not support lookup as web pages.
vcs_scheme = _match_vcs_scheme(url)
if vcs_scheme:
logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
return None
# Tack index.html onto file:// URLs that point to directories
scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)
try:
resp = _get_html_response(url, session=session)
except _NotHTTP:
logger.debug(
'Skipping page %s because it looks like an archive, and cannot '
'be checked by HEAD.', link,
)
except _NotHTML as exc:
logger.debug(
'Skipping page %s because the %s request got Content-Type: %s',
link, exc.request_desc, exc.content_type,
)
except HTTPError as exc:
_handle_get_page_fail(link, exc)
url = link.url.split('#', 1)[0]
# Check for VCS schemes that do not support lookup as web pages.
vcs_scheme = _match_vcs_scheme(url)
if vcs_scheme:
logger.debug('Cannot look at %s URL %s', vcs_scheme, link)
return None
# Tack index.html onto file:// URLs that point to directories
scheme, _, path, _, _, _ = urllib_parse.urlparse(url)
if (scheme == 'file' and os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)
try:
resp = _get_html_response(url, session=session)
except _NotHTTP:
logger.debug(
'Skipping page %s because it looks like an archive, and cannot '
'be checked by HEAD.', link,
)
except _NotHTML as exc:
logger.debug(
'Skipping page %s because the %s request got Content-Type: %s',
link, exc.request_desc, exc.content_type,
)
except HTTPError as exc:
_handle_get_page_fail(link, exc)
content_type,
)
return
logger.debug('Getting page %s', url)
# Tack index.html onto file:// URLs that point to directories
(scheme, netloc, path, params, query, fragment) = \
urllib_parse.urlparse(url)
if (scheme == 'file' and
os.path.isdir(urllib_request.url2pathname(path))):
# add trailing slash if not present so urljoin doesn't trim
# final segment
if not url.endswith('/'):
url += '/'
url = urllib_parse.urljoin(url, 'index.html')
logger.debug(' file: URL is directory, getting %s', url)
resp = session.get(
url,
headers={
"Accept": "text/html",
# We don't want to blindly returned cached data for
# /simple/, because authors generally expecting that
# twine upload && pip install will function, but if
# they've done a pip install in the last ~10 minutes
# it won't. Thus by setting this to zero we will not
# blindly use any cached data, however the benefit of
# using max-age=0 instead of no-cache, is that we will
# still support conditional requests, so we will still
# minimize traffic sent in cases where the page hasn't
# changed at all, we will just always incur the round
def links(self):
"""Yields all links in the page"""
for anchor in self.parsed.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
url = self.clean_link(
urllib_parse.urljoin(self.base_url, href)
)
# Determine if this link is internal. If that distinction
# doesn't make sense in this context, then we don't make
# any distinction.
internal = None
if self.api_version and self.api_version >= 2:
# Only api_versions >= 2 have a distinction between
# external and internal links
internal = bool(
anchor.get("rel")
and "internal" in anchor.get("rel").split()
)
yield Link(url, self, internal=internal)
def url_to_path(self, path):
return urllib_parse.urljoin(self.url, path)
def iter_links(self):
# type: () -> Iterable[Link]
"""Yields all links in the page"""
document = html5lib.parse(
self.content,
transport_encoding=_get_encoding_from_headers(self.headers),
namespaceHTMLElements=False,
)
base_url = _determine_base_url(document, self.url)
for anchor in document.findall(".//a"):
if anchor.get("href"):
href = anchor.get("href")
url = _clean_link(urllib_parse.urljoin(base_url, href))
pyrequire = anchor.get('data-requires-python')
pyrequire = unescape(pyrequire) if pyrequire else None
yield Link(url, self.url, requires_python=pyrequire)
use_pep517=use_pep517,
constraint=constraint, isolated=isolated, wheel_cache=wheel_cache
)
# parse a nested requirements file
elif opts.requirements or opts.constraints:
if opts.requirements:
req_path = opts.requirements[0]
nested_constraint = False
else:
req_path = opts.constraints[0]
nested_constraint = True
# original file is over http
if SCHEME_RE.search(filename):
# do a url join so relative paths work
req_path = urllib_parse.urljoin(filename, req_path)
# original file and nested file are paths
elif not SCHEME_RE.search(req_path):
# do a join so relative paths work
req_path = os.path.join(os.path.dirname(filename), req_path)
# TODO: Why not use `comes_from='-r {} (line {})'` here as well?
parsed_reqs = parse_requirements(
req_path, finder, comes_from, options, session,
constraint=nested_constraint, wheel_cache=wheel_cache
)
for req in parsed_reqs:
yield req
# percolate hash-checking option upward
elif opts.require_hashes:
options.require_hashes = opts.require_hashes
use_pep517=use_pep517,
constraint=constraint, isolated=isolated, wheel_cache=wheel_cache
)
# parse a nested requirements file
elif opts.requirements or opts.constraints:
if opts.requirements:
req_path = opts.requirements[0]
nested_constraint = False
else:
req_path = opts.constraints[0]
nested_constraint = True
# original file is over http
if SCHEME_RE.search(filename):
# do a url join so relative paths work
req_path = urllib_parse.urljoin(filename, req_path)
# original file and nested file are paths
elif not SCHEME_RE.search(req_path):
# do a join so relative paths work
req_path = os.path.join(os.path.dirname(filename), req_path)
# TODO: Why not use `comes_from='-r {} (line {})'` here as well?
parsed_reqs = parse_requirements(
req_path, finder, comes_from, options, session,
constraint=nested_constraint, wheel_cache=wheel_cache
)
for req in parsed_reqs:
yield req
# percolate hash-checking option upward
elif opts.require_hashes:
options.require_hashes = opts.require_hashes
def _url_for_path(self, path):
# type: (str) -> str
return urllib_parse.urljoin(self.url, path)