reddit-image-wall-getter/reddit_imgs/system/urlmatcher.py

28 lines
946 B
Python
Raw Permalink Normal View History

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import re
from typing import List
2020-07-20 01:54:26 +00:00
# Raw pattern sources for HTML attribute values. Each pattern captures the
# quoted attribute value in group 1; the double-quoted and single-quoted
# spellings are kept as separate patterns.
HREF_RGX1_PTTRN = r'href=\"([^"]*)\"'
HREF_RGX2_PTTRN = r"href=\'([^']*)\'"
SRC_RGX1_PTTRN = r'src=\"([^"]*)\"'
SRC_RGX2_PTTRN = r"src=\'([^']*)\'"
# Absolute http/https/ftp URL appearing anywhere in the text.
#   group 1: scheme
#   group 2: host (must contain at least one dot)
#   group 3: optional remainder (path/query/fragment); its last character may
#            not be '.', ',' or ':' so trailing punctuation is left out.
URL_RGX_PTTRN = r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'

# Compiled once at import time so callers do not recompile per call.
HREF_RGX1 = re.compile(HREF_RGX1_PTTRN)
HREF_RGX2 = re.compile(HREF_RGX2_PTTRN)
SRC_RGX1 = re.compile(SRC_RGX1_PTTRN)
SRC_RGX2 = re.compile(SRC_RGX2_PTTRN)
URL_RGX = re.compile(URL_RGX_PTTRN)
def search_urls(text: str) -> List[str]:
    """Collect the distinct URL-like strings found in ``text``.

    Two kinds of matches are gathered:

    * every full match of ``URL_RGX`` (absolute http/https/ftp URLs), and
    * the quoted value (group 1) of every ``src=`` / ``href=`` attribute
      matched by ``SRC_RGX1``, ``SRC_RGX2``, ``HREF_RGX1`` and ``HREF_RGX2``.
      These values are returned verbatim, so they may be relative paths or
      empty strings rather than absolute URLs.

    Args:
        text: The text (typically HTML) to scan.

    Returns:
        A list with each distinct match exactly once, in first-seen order:
        absolute-URL matches first, then ``src`` values, then ``href`` values.
    """
    # A dict keeps insertion order, giving de-duplication with a stable,
    # reproducible result order (a plain set would not).
    unique = {}
    for match in URL_RGX.finditer(text):
        unique[match.group(0)] = None
    for attr_rgx in (SRC_RGX1, SRC_RGX2, HREF_RGX1, HREF_RGX2):
        for match in attr_rgx.finditer(text):
            unique[match.group(1)] = None
    return list(unique)