#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
"""Regex-based extraction of URLs and ``href``/``src`` attribute values."""

import re
from typing import List

# Attribute patterns come in pairs: one for double-quoted values and one for
# single-quoted values.  Group 1 is the raw attribute value (may be empty).
HREF_RGX1_PTTRN = r'href=\"([^"]*)\"'
HREF_RGX2_PTTRN = r"href=\'([^']*)\'"
SRC_RGX1_PTTRN = r'src=\"([^"]*)\"'
SRC_RGX2_PTTRN = r"src=\'([^']*)\'"
# Absolute http/https/ftp URL; the whole match (group 0) is the URL.
URL_RGX_PTTRN = r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'

HREF_RGX1 = re.compile(HREF_RGX1_PTTRN)
HREF_RGX2 = re.compile(HREF_RGX2_PTTRN)
SRC_RGX1 = re.compile(SRC_RGX1_PTTRN)
SRC_RGX2 = re.compile(SRC_RGX2_PTTRN)
URL_RGX = re.compile(URL_RGX_PTTRN)

# (compiled pattern, capture group holding the value), in scan order.
_EXTRACTORS = (
    (URL_RGX, 0),
    (SRC_RGX1, 1),
    (SRC_RGX2, 1),
    (HREF_RGX1, 1),
    (HREF_RGX2, 1),
)


def search_urls(text: str) -> List[str]:
    """Return the distinct URLs and ``href``/``src`` values found in *text*.

    The text is scanned once per pattern: absolute ``http``/``https``/``ftp``
    URLs first, then double- and single-quoted ``src`` values, then double-
    and single-quoted ``href`` values.

    Args:
        text: Arbitrary text (typically HTML) to scan.

    Returns:
        A list without duplicates.  Items are ordered by the pattern that
        first produced them (see above) and, within one pattern, by position
        in *text*.  Attribute values are returned verbatim, so relative paths
        and empty strings (``href=""``) can appear in the result.
    """
    # A dict keeps insertion order, which gives a reproducible result; a set
    # would de-duplicate too, but its iteration order for strings can change
    # between interpreter runs.
    found = {}
    for pattern, group in _EXTRACTORS:
        for match in pattern.finditer(text):
            found.setdefault(match.group(group), None)
    return list(found)