#!/usr/bin/env python3
#******************************************************************************
# Copyright (C) 2018 Thomas "Cakeisalie5" Touhey <thomas@touhey.fr>
# This file is part of the textoutpc project, which is MIT-licensed.
#******************************************************************************
""" Autolinking (URL extraction from raw text) in HTML. """
|
|
|
|
|
|
|
|
import regex as _re
|
|
|
|
|
2018-02-11 12:01:32 +01:00
|
|
|
# Names exported by `from <module> import *`: the two public entry points.
__all__ = ["htmlurls", "lightscripturls"]
|
2018-01-19 22:44:43 +01:00
|
|
|
|
|
|
|
# ---
# Autolinking regex.
# ---
|
|
|
|
|
2018-02-11 12:01:32 +01:00
|
|
|
def _sub_html(m):
    """ Build the HTML replacement for one URL match.

    `m` is a match object exposing the named groups `sp` (the text
    captured right before the URL) and `url` (the URL itself).

    Returns `sp`, followed by an `<a>` element whose `href` and content
    are both the URL, followed by the trailing comma if one was removed
    from the URL. """

    sp = m.group('sp')
    url = m.group('url')
    aft = ''

    # Hack for the last comma: a comma ending the URL is considered as
    # punctuation of the surrounding text, so it is put after the link.
    if url.endswith(','):
        url, aft = url[:-1], ','

    # NOTE(review): the URL is inserted as-is, without HTML escaping;
    # presumably the caller has escaped the text beforehand -- confirm.
    return '{}<a href="{}">{}</a>{}'.format(sp, url, url, aft)
|
|
|
|
|
2018-02-11 12:01:32 +01:00
|
|
|
def _sub_lightscript(m):
    """ Build the lightscript replacement for one URL match.

    `m` is a match object exposing the named groups `sp` (the text
    captured right before the URL) and `url` (the URL itself).

    Returns `sp`, followed by the URL enclosed in angle brackets,
    followed by the trailing comma if one was removed from the URL. """

    sp = m.group('sp')
    url = m.group('url')
    aft = ''

    # Hack for the last comma: a comma ending the URL is considered as
    # punctuation of the surrounding text, so it is put after the link.
    if url.endswith(','):
        url, aft = url[:-1], ','

    # Angle brackets delimit the link in the output, so the ones that are
    # part of the URL are percent-encoded.
    url = url.replace('<', '%3C').replace('>', '%3E')

    return '{}<{}>{}'.format(sp, url, aft)
|
|
|
|
|
2018-01-19 22:44:43 +01:00
|
|
|
# Pattern locating the URLs in a text, with the following named groups:
#   sp:    what comes right before the URL (start of a line, a whitespace
#          or a punctuation character); the callbacks put it back as-is.
#   url:   the URL itself, starting with an http, https or ftp scheme.
#   ucore: the URL body, which may contain nested [...] and (...) parts
#          whose closing bracket is optional.
# A raw string is used so that the regex escapes (\s, \[, ...) are not
# read as invalid Python string escapes.
_reg = _re.compile(r"""
    (?P<sp>^|\s|[[:punct:]])
    (?P<url>(https?|ftp):
        (?P<ucore>[^\[\]\(\)\s]* (\[(?&ucore)\]?)* (\((?&ucore)\)?)*)*
    )
""", _re.VERBOSE | _re.M)
|
|
|
|
|
|
|
|
# ---
# Main functions.
# ---
|
|
|
|
|
|
|
|
def htmlurls(text):
    """ Turn the URLs found in `text` into HTML links.

    Every match of the module-level `_reg` pattern is replaced by the
    output of `_sub_html`; the rest of the text is left untouched.
    Returns the resulting string. """

    return _reg.sub(_sub_html, text)
|
|
|
|
|
|
|
|
def lightscripturls(text):
    """ Turn the URLs found in `text` into lightscript links.

    Every match of the module-level `_reg` pattern is replaced by the
    output of `_sub_lightscript`; the rest of the text is left untouched.
    Returns the resulting string. """

    return _reg.sub(_sub_lightscript, text)
|
2018-01-19 22:44:43 +01:00
|
|
|
|
|
|
|
# End of file.
|