reddit-image-wall-getter/reddit_imgs/system/urlmatcher.py

28 lines
946 B
Python

#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
import re
from typing import List
# --- HTML attribute patterns -------------------------------------------------
# Each pattern captures the raw attribute value in group 1. One variant exists
# per quoting style (double-quoted / single-quoted); the value may be empty.

# href="..."
HREF_RGX1_PTTRN = r'href=\"([^"]*)\"'
HREF_RGX1 = re.compile(HREF_RGX1_PTTRN)

# href='...'
HREF_RGX2_PTTRN = r"href=\'([^']*)\'"
HREF_RGX2 = re.compile(HREF_RGX2_PTTRN)

# src="..."
SRC_RGX1_PTTRN = r'src=\"([^"]*)\"'
SRC_RGX1 = re.compile(SRC_RGX1_PTTRN)

# src='...'
SRC_RGX2_PTTRN = r"src=\'([^']*)\'"
SRC_RGX2 = re.compile(SRC_RGX2_PTTRN)

# --- Bare URL pattern --------------------------------------------------------
# Matches http/ftp/https URLs appearing anywhere in the text.
#   group 1: scheme
#   group 2: dotted host name (at least one dot is required)
#   group 3: optional remainder (path/query/fragment); its last character is
#            restricted so trailing '.', ',' or ':' are not swallowed.
URL_RGX_PTTRN = r'(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?'
URL_RGX = re.compile(URL_RGX_PTTRN)
def search_urls(text: str) -> List[str]:
    """Collect the distinct URL-like strings found in *text*.

    Five sources are scanned, in this order: bare http/ftp/https URLs
    (whole match of ``URL_RGX``), then the values of double- and
    single-quoted ``src`` attributes, then the values of double- and
    single-quoted ``href`` attributes.

    Attribute values are returned verbatim, so the result may contain
    relative links or an empty string if the markup has them.

    Args:
        text: The text (typically HTML) to scan.

    Returns:
        Each distinct string once, in first-seen order (by source in the
        order above, then by position in *text*). The order is therefore
        reproducible between runs, unlike iteration over a ``set`` of
        strings.
    """
    # (compiled pattern, index of the group holding the URL)
    sources = (
        (URL_RGX, 0),
        (SRC_RGX1, 1),
        (SRC_RGX2, 1),
        (HREF_RGX1, 1),
        (HREF_RGX2, 1),
    )
    seen = set()
    urls = []
    for pattern, group_index in sources:
        for match in pattern.finditer(text):
            url = match.group(group_index)
            # Deduplicate while keeping the first occurrence's position.
            if url not in seen:
                seen.add(url)
                urls.append(url)
    return urls