2020-06-03 03:27:13 +00:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
# -*- encoding: utf-8 -*-
|
|
|
|
|
|
|
|
import re
|
|
|
|
from typing import List
|
|
|
|
|
2020-07-20 01:54:26 +00:00
|
|
|
# Regular expressions used to pull link targets out of text/markup.
#
# Each ``*_PTTRN`` constant is the raw pattern source and the matching
# constant without the suffix is its compiled form.  For the four attribute
# patterns, capture group 1 is the attribute value without the surrounding
# quotes (it may be an empty string, since the value is matched with ``*``).

# href attribute whose value is wrapped in double quotes.
HREF_RGX1_PTTRN = r'href=\"([^"]*)\"'
HREF_RGX1 = re.compile(HREF_RGX1_PTTRN)

# href attribute whose value is wrapped in single quotes.
HREF_RGX2_PTTRN = r"href=\'([^']*)\'"
HREF_RGX2 = re.compile(HREF_RGX2_PTTRN)

# src attribute whose value is wrapped in double quotes.
SRC_RGX1_PTTRN = r'src=\"([^"]*)\"'
SRC_RGX1 = re.compile(SRC_RGX1_PTTRN)

# src attribute whose value is wrapped in single quotes.
SRC_RGX2_PTTRN = r"src=\'([^']*)\'"
SRC_RGX2 = re.compile(SRC_RGX2_PTTRN)

# Absolute http/https/ftp URL appearing anywhere in the text.
# Group 1 is the scheme, group 2 the dotted host name, and the optional
# group 3 the rest of the URL.  The rest may not end in '.', ',' or ':',
# so trailing sentence punctuation is left out of the match.
URL_RGX_PTTRN = r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
URL_RGX = re.compile(URL_RGX_PTTRN)
|
|
|
|
|
|
|
|
|
|
|
|
def search_urls(text: str) -> List[str]:
    """Return the distinct link targets found in *text*.

    Five module-level regexes are scanned in turn: absolute URLs
    (``URL_RGX``), then double- and single-quoted ``src`` attribute values
    (``SRC_RGX1``, ``SRC_RGX2``), then double- and single-quoted ``href``
    attribute values (``HREF_RGX1``, ``HREF_RGX2``).

    Attribute values are returned exactly as captured, so the result may
    contain relative paths or an empty string.

    Args:
        text: The text or markup to scan.

    Returns:
        Each distinct match once, in the order it was first encountered
        while scanning the regexes in the sequence listed above.
    """
    # (compiled regex, capture group to extract).  Group 0 is the whole
    # match for the bare-URL regex; group 1 is the unquoted attribute value
    # for the src/href regexes.
    extractors = (
        (URL_RGX, 0),
        (SRC_RGX1, 1),
        (SRC_RGX2, 1),
        (HREF_RGX1, 1),
        (HREF_RGX2, 1),
    )
    found = (
        match.group(group)
        for regex, group in extractors
        for match in regex.finditer(text)
    )
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is the same on every run (a set would give an arbitrary order).
    return list(dict.fromkeys(found))
|