WATFAG/src/watfag/parsers/generic/group.py

import regex as re

from watfag.parsers.generic import GenericParser
from watfag.parsers.generic.watfag import Group
from watfag.parsers.generic.parsers import DataParser

# Candidate regexes for a release-group tag, listed in the order they are tried.
# Each one is anchored to the end of the text and exposes the name as the
# named capture "group". The capture may be empty, so callers must validate it.
patterns = [
    re.compile(source, re.UNICODE)
    for source in (
        # "-NAME" or " - NAME", optionally followed by ")".
        # NAME may contain ASCII letters, digits, spaces and "&".
        r"(?:-| - )(?P<group>[a-zA-Z0-9 &]*)\)?$",
        # "[NAME" with an optional closing "]".
        # NAME may contain ASCII letters, digits, spaces and "&".
        r"\[(?P<group>[a-zA-Z0-9 &]*)\]?$",
        # A single space, then NAME optionally wrapped as "[NAME]" and/or
        # followed by ")". NAME is ASCII letters and digits only.
        r"(?: )\[?(?P<group>[a-zA-Z0-9]*?)]?\)?$",
    )
]

# Lowercase fragments that disqualify a candidate group when they occur anywhere
# inside its lowercased text. Entries padded with spaces only hit when the token
# has a space on both sides within the candidate.
invalid_group_substrs = [
    " hevc ",  # Can appear at end of release name while not being a group
    " x264 ",
    " x265 ",
    " truehd ",
    "bluray",
]

# Complete group names (compared case-sensitively) that are never valid.
invalid_groups = ["MP4"]

class GroupParser(DataParser, GenericParser):
    """Finds a release-group tag at the end of the release's metadata text."""

    def parse(self) -> bool:
        """Try each entry of ``patterns`` in order and accept the first valid hit.

        A hit is rejected, and the next pattern tried, when its captured text:
          * contains any entry of ``invalid_group_substrs`` (checked on the
            lowercased, unstripped capture),
          * equals an entry of ``invalid_groups`` once stripped, or
          * is shorter than two characters once stripped.

        On acceptance the stripped name is stored in ``release.group_name``,
        ``Group.from_string(name)`` is stored in ``release.group``, and the
        matched span is removed from ``release.metadata_text``, whose
        whitespace runs are then collapsed to single spaces and trimmed.

        Returns:
            True if a group was found and recorded, False otherwise.
        """
        text = self.release.metadata_text

        for candidate in patterns:
            hit = candidate.search(text)
            if not hit:
                continue

            captured = hit.group("group")
            name = captured.strip()

            # The substring test uses the raw capture, not the stripped name,
            # so space-padded entries can still match at the capture's edges.
            lowered = captured.lower()
            has_banned_fragment = any(
                fragment in lowered for fragment in invalid_group_substrs
            )
            if has_banned_fragment or name in invalid_groups or len(name) < 2:
                continue

            self.release.group = Group.from_string(name)
            self.release.group_name = name

            # Remove the matched tag, then tidy up the whitespace left behind.
            start, end = hit.span()
            leftover = text[:start] + text[end:]
            self.release.metadata_text = re.sub(r"\s+", " ", leftover).strip()
            return True

        return False