# Spaces:
# Runtime error
# Runtime error
import re
# Compiled pattern that locates URLs embedded in free text.
#
# A match must start with "http://", "https://", "ftp://" or a "www."-style
# prefix ("www" followed by up to three digits and a dot), and must not be
# immediately preceded by a word character, "/" or ".".
# Group 1 is set only when the host is the literal "localhost".
#
# NOTE: the Unicode ranges below are written as `\u00a1-\uffff` (single
# backslash inside a raw string), which the `re` module itself resolves to the
# code points U+00A1..U+FFFF.  The previous spelling `\\u00a1-\\uffff` was
# parsed as a literal backslash, the letters "u00a", and the range `1-\`, so
# non-ASCII host names never matched while characters such as "<", "=", "@"
# and digits in the TLD were accepted by accident.
URL_REGEX = re.compile(
    # left boundary: start of string, or not preceded by a word char, "/" or "."
    r"(?:^|(?<![\w\/\.]))"
    # protocol identifier
    # r"(?:(?:https?|ftp)://)"  <-- alt?
    r"(?:(?:https?:\/\/|ftp:\/\/|www\d{0,3}\.))"
    # user:pass authentication
    r"(?:\S+(?::\S*)?@)?"
    r"(?:"
    # IP address exclusion
    # private & local networks
    r"(?!(?:10|127)(?:\.\d{1,3}){3})"
    r"(?!(?:169\.254|192\.168)(?:\.\d{1,3}){2})"
    r"(?!172\.(?:1[6-9]|2\d|3[0-1])(?:\.\d{1,3}){2})"
    # IP address dotted notation octets
    # excludes loopback network 0.0.0.0
    # excludes reserved space >= 224.0.0.0
    # excludes network & broadcast addresses
    # (first & last IP address of each class)
    r"(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])"
    r"(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5])){2}"
    r"(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))"
    r"|"
    # host name
    r"(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)"
    # domain name
    r"(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*"
    # TLD identifier (letters only, at least two)
    r"(?:\.(?:[a-z\u00a1-\uffff]{2,}))"
    r"|"
    # bare "localhost" (captured as group 1)
    r"(?:(localhost))"
    r")"
    # port number
    r"(?::\d{2,5})?"
    # resource path: runs until whitespace or a closing ")", "]" or "}"
    r"(?:\/[^\)\]\}\s]*)?",
    # r"(?:$|(?![\w?!+&\/\)]))",
    # @jfilter: I removed the line above from the regex because I don't
    # understand what it is used for, maybe it was useful?
    # But I made sure that it does not include ), ] and } in the URL.
    flags=re.UNICODE | re.IGNORECASE,
)