From 26c191b19c7be4d7a3a2b6b3c767d3041de07bfe Mon Sep 17 00:00:00 2001 From: tcely Date: Sat, 2 May 2026 09:41:21 -0400 Subject: [PATCH 1/3] feat: enhance `msg_from_str` to handle older formats Add support for parsing older syslog message formats. --- src_py/hat/syslog/encoder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src_py/hat/syslog/encoder.py b/src_py/hat/syslog/encoder.py index e04ce30..b9fc659 100644 --- a/src_py/hat/syslog/encoder.py +++ b/src_py/hat/syslog/encoder.py @@ -6,6 +6,7 @@ from hat import json from hat.syslog import common +from hat.syslog import older_formats def msg_to_str(msg: common.Msg) -> str: @@ -25,7 +26,10 @@ def msg_to_str(msg: common.Msg) -> str: def msg_from_str(msg_str: str) -> common.Msg: """Parse message string formatted according to RFC 5424""" - match = _msg_pattern.fullmatch(msg_str).groupdict() + match = _msg_pattern.fullmatch(msg_str) + if match is None: + return older_formats.msg_from_rfc3164_str(msg_str) + match = match.groupdict() prival = int(match['prival']) return common.Msg( facility=common.Facility(prival // 8), From ba90b78525f13299682b97bab6e4772be977f739 Mon Sep 17 00:00:00 2001 From: tcely Date: Wed, 13 May 2026 19:15:30 -0400 Subject: [PATCH 2/3] feat: add RFC 3164 parser for syslog messages Implement RFC 3164 log message parser with regex formats. 
---
 src_py/hat/syslog/older_formats.py | 131 +++++++++++++++++++++++++++++
 1 file changed, 131 insertions(+)
 create mode 100644 src_py/hat/syslog/older_formats.py

diff --git a/src_py/hat/syslog/older_formats.py b/src_py/hat/syslog/older_formats.py
new file mode 100644
index 0000000..b16b1b8
--- /dev/null
+++ b/src_py/hat/syslog/older_formats.py
@@ -0,0 +1,131 @@
+import re
+import socket
+from datetime import datetime
+from hat.syslog import common
+
+
+KNOWN_HOSTNAME = socket.gethostname()
+RE_HOSTNAME = re.escape(KNOWN_HOSTNAME)
+
+VALID_MONTHS = (
+    'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
+    'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec',
+)
+
+S = ' '
+EOL = r'\r?\n?'
+PRI = '<(?P<prival>0|[1-9][0-9]?|1[0-8][0-9]|19[0-1])>'
+MONTH = f'(?P<month>{"|".join(map(re.escape, VALID_MONTHS))})'
+DAY = f'(?P<day>{S}[1-9]|[1-2][0-9]|3[0-1])'
+HOUR = '(?P<hour>[0-1][0-9]|2[0-3])'
+MINUTE = ':(?P<minute>[0-5][0-9])'
+SECOND = ':(?P<second>[0-5][0-9])'
+HOST_STRICT = f'(?:{S}{RE_HOSTNAME}(?={S}))?'
+NOT_HOST = f'(?!{RE_HOSTNAME})'
+PID = r'(?:[\[](?P<procid>[0-9]+)[\]])'
+TAG_PID = f'(?P<app_name>.+?){PID}?:'
+MSG_BODY = '(?P<msg>.+)'
+
+formats = [
+    # Generic with optional hostname and PID:
+    # Begins with a partial (5-15) ctime() date string.
+    # Does not accept hostnames other than this one.
+    # Remote logs that do not include a hostname are accepted.
+    {'parts': (
+        PRI, MONTH, S, DAY, S, HOUR, MINUTE, SECOND,
+        HOST_STRICT, S, NOT_HOST, TAG_PID, S,
+        MSG_BODY, EOL,
+    )},
+    # Support for `gunicorn` logs:
+    # No date or hostname.
+    # Also, non-standard PID placement.
+    {'parts': (
+        PRI,
+        '(?P<app_name>.+?):', S, PID, S,
+        MSG_BODY, EOL,
+    )},
+]
+for _dict in formats:
+    _dict['regex'] = re.compile(''.join(_dict['parts']))
+
+
+def msg_from_rfc3164_str(msg_str: str) -> common.Msg:
+    """Parse a BSD syslog (RFC 3164 style) message string.
+
+    The message is tried against every entry of `formats` in order and
+    the first pattern that matches the whole string is used.
+
+    The wire format carries no year, so the current year is assumed; a
+    timestamp that would then lie in the future is moved back one year.
+    Formats without a date use the current local time instead.
+
+    Args:
+        msg_str: complete message, optionally ending with CR and/or LF
+
+    Returns:
+        message with facility and severity decoded from the priority;
+        hostname is always `KNOWN_HOSTNAME`
+
+    Raises:
+        ValueError: no format matched, the date is not a valid calendar
+            date, or the process ID is not an integer in 1..4194304
+
+    """
+    now = datetime.now()
+
+    match_obj = None
+    for _dict in formats:
+        match_obj = _dict['regex'].fullmatch(msg_str)
+        if match_obj is not None:
+            break
+
+    if match_obj is None:
+        raise ValueError(f'No formats matched: {msg_str.encode()!r}')
+
+    m = match_obj.groupdict()
+
+    if 'month' in m:
+        day_val = m['day'].replace(' ', '0')
+        time_str = f'{m["hour"]}:{m["minute"]}:{m["second"]}'
+        ts_str = f'{now.year} {m["month"]} {day_val} {time_str}'
+
+        dt = datetime.strptime(ts_str, '%Y %b %d %H:%M:%S')
+        # The skew should be zero when logging from the same host.
+        if now < dt:
+            dt = dt.replace(year=now.year - 1)
+    else:
+        # The matched format did not include the date and time.
+        dt = now
+
+    prival = int(m['prival'], 10)
+    procid = m.get('procid')
+    tag_str = m['app_name']
+    # A tag such as `app[abc]` is not matched by the numeric PID group;
+    # take the bracket content here so it is validated below as well.
+    bracket_str = tag_str.rsplit('[')[-1][:-1]
+    if (procid is None and tag_str.endswith(']') and bracket_str and
+            bracket_str != ']'):
+        procid = bracket_str
+    if procid is not None:
+        try:
+            _pid = int(procid, 10)
+        except ValueError as e:
+            raise ValueError(f'Invalid process ID: {e}') from e
+        if _pid <= 0:
+            raise ValueError('Invalid process ID: too low')
+        if _pid > 4_194_304:  # read from /proc instead?
+            raise ValueError('Invalid process ID: too high')
+
+    return common.Msg(
+        facility=common.Facility(prival // 8),
+        severity=common.Severity(prival % 8),
+        version=None,
+        timestamp=dt.timestamp(),
+        hostname=KNOWN_HOSTNAME,
+        app_name=m['app_name'],
+        procid=m.get('procid', None),
+        msgid=None,
+        data=None,
+        msg=m['msg']
+    )
+

From 94e319d71cd7785b340e9547f8b697ea7fa95650 Mon Sep 17 00:00:00 2001
From: tcely
Date: Fri, 15 May 2026 18:23:43 -0400
Subject: [PATCH 3/3] fix: allow embedded newline in messages

---
 src_py/hat/syslog/older_formats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src_py/hat/syslog/older_formats.py b/src_py/hat/syslog/older_formats.py
index b16b1b8..24b5c86 100644
--- a/src_py/hat/syslog/older_formats.py
+++ b/src_py/hat/syslog/older_formats.py
@@ -24,7 +24,7 @@
 NOT_HOST = f'(?!{RE_HOSTNAME})'
 PID = r'(?:[\[](?P<procid>[0-9]+)[\]])'
 TAG_PID = f'(?P<app_name>.+?){PID}?:'
-MSG_BODY = '(?P<msg>.+)'
+MSG_BODY = '(?P<msg>(?s:.)+?)'
 
 formats = [
     # Generic with optional hostname and PID: