OXIESEC PANEL

Name	Size	Modified	Perms
📁 ..	-	10/26/2024 01:28:40 PM	rwxr-xr-x
📄 __init__.py	3.62 KB	10/26/2024 01:28:31 PM	rw-r--r--
📁 __pycache__	-	10/26/2024 01:29:39 PM	rwxr-xr-x
📄 big5freq.py	30.54 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 big5prober.py	1.7 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 chardistribution.py	9.38 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 charsetgroupprober.py	3.73 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 charsetprober.py	4.69 KB	10/26/2024 01:28:30 PM	rw-r--r--
📁 cli	-	10/26/2024 01:30:12 PM	rwxr-xr-x
📄 codingstatemachine.py	3.48 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 cp949prober.py	1.79 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 enums.py	1.58 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 escprober.py	3.77 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 escsm.py	11.74 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 eucjpprober.py	3.59 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 euckrfreq.py	13.25 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 euckrprober.py	1.69 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 euctwfreq.py	36.05 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 euctwprober.py	1.69 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 gb2312freq.py	20.25 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 gb2312prober.py	1.7 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 hebrewprober.py	13.59 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 jisfreq.py	25.19 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 johabfreq.py	41.5 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 johabprober.py	1.69 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 jpcntx.py	26.17 KB	10/26/2024 01:28:30 PM	rw-r--r--
📄 langbulgarianmodel.py	102.11 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 langgreekmodel.py	96.18 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 langhebrewmodel.py	95.89 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 langhungarianmodel.py	98.99 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 langrussianmodel.py	125.03 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 langthaimodel.py	100.37 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 langturkishmodel.py	93.14 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 latin1prober.py	5.14 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 mbcharsetprober.py	3.29 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 mbcsgroupprober.py	2.01 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 mbcssm.py	29.36 KB	10/26/2024 01:28:31 PM	rw-r--r--
📁 metadata	-	10/26/2024 01:30:12 PM	rwxr-xr-x
📄 sbcharsetprober.py	6.05 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 sbcsgroupprober.py	4.03 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 sjisprober.py	3.66 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 universaldetector.py	12.98 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 utf1632prober.py	8.09 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 utf8prober.py	2.65 KB	10/26/2024 01:28:31 PM	rw-r--r--
📄 version.py	242 bytes	10/26/2024 01:28:31 PM	rw-r--r--

Editing: utf1632prober.py

######################## BEGIN LICENSE BLOCK ########################
#
# Contributor(s):
#   Jason Zavaglia
#
# This library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# This library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
# 02110-1301  USA
######################### END LICENSE BLOCK #########################
from .charsetprober import CharSetProber
from .enums import ProbingState

class UTF1632Prober(CharSetProber):
    """
    This class simply looks for occurrences of zero bytes, and infers
    whether the file is UTF16 or UTF32 (low-endian or big-endian)
    For instance, files looking like ( \0 \0 \0 [nonzero] )+
    have a good probability to be UTF32BE.  Files looking like ( \0 [nonzero] )+
    may be guessed to be UTF16BE, and inversely for little-endian varieties.
    """

# how many logical characters to scan before feeling confident of prediction
    MIN_CHARS_FOR_DETECTION = 20
    # a fixed constant ratio of expected zeros or non-zeros in modulo-position.
    EXPECTED_RATIO = 0.94

def __init__(self):
        super().__init__()
        self.position = 0
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.quad = [0, 0, 0, 0]
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.reset()

def reset(self):
        super().reset()
        self.position = 0
        self.zeros_at_mod = [0] * 4
        self.nonzeros_at_mod = [0] * 4
        self._state = ProbingState.DETECTING
        self.invalid_utf16be = False
        self.invalid_utf16le = False
        self.invalid_utf32be = False
        self.invalid_utf32le = False
        self.first_half_surrogate_pair_detected_16be = False
        self.first_half_surrogate_pair_detected_16le = False
        self.quad = [0, 0, 0, 0]

@property
    def charset_name(self):
        if self.is_likely_utf32be():
            return "utf-32be"
        if self.is_likely_utf32le():
            return "utf-32le"
        if self.is_likely_utf16be():
            return "utf-16be"
        if self.is_likely_utf16le():
            return "utf-16le"
        # default to something valid
        return "utf-16"

@property
    def language(self):
        return ""

def approx_32bit_chars(self):
        return max(1.0, self.position / 4.0)

def approx_16bit_chars(self):
        return max(1.0, self.position / 2.0)

def is_likely_utf32be(self):
        approx_chars = self.approx_32bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            self.zeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
            and self.nonzeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
            and not self.invalid_utf32be
        )

def is_likely_utf32le(self):
        approx_chars = self.approx_32bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            self.nonzeros_at_mod[0] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[1] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[2] / approx_chars > self.EXPECTED_RATIO
            and self.zeros_at_mod[3] / approx_chars > self.EXPECTED_RATIO
            and not self.invalid_utf32le
        )

def is_likely_utf16be(self):
        approx_chars = self.approx_16bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            (self.nonzeros_at_mod[1] + self.nonzeros_at_mod[3]) / approx_chars
            > self.EXPECTED_RATIO
            and (self.zeros_at_mod[0] + self.zeros_at_mod[2]) / approx_chars
            > self.EXPECTED_RATIO
            and not self.invalid_utf16be
        )

def is_likely_utf16le(self):
        approx_chars = self.approx_16bit_chars()
        return approx_chars >= self.MIN_CHARS_FOR_DETECTION and (
            (self.nonzeros_at_mod[0] + self.nonzeros_at_mod[2]) / approx_chars
            > self.EXPECTED_RATIO
            and (self.zeros_at_mod[1] + self.zeros_at_mod[3]) / approx_chars
            > self.EXPECTED_RATIO
            and not self.invalid_utf16le
        )

def validate_utf32_characters(self, quad):
        """
        Validate if the quad of bytes is valid UTF-32.

UTF-32 is valid in the range 0x00000000 - 0x0010FFFF
        excluding 0x0000D800 - 0x0000DFFF

https://en.wikipedia.org/wiki/UTF-32
        """
        if (
            quad[0] != 0
            or quad[1] > 0x10
            or (quad[0] == 0 and quad[1] == 0 and 0xD8 <= quad[2] <= 0xDF)
        ):
            self.invalid_utf32be = True
        if (
            quad[3] != 0
            or quad[2] > 0x10
            or (quad[3] == 0 and quad[2] == 0 and 0xD8 <= quad[1] <= 0xDF)
        ):
            self.invalid_utf32le = True

def validate_utf16_characters(self, pair):
        """
        Validate if the pair of bytes is  valid UTF-16.

UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF
        with an exception for surrogate pairs, which must be in the range
        0xD800-0xDBFF followed by 0xDC00-0xDFFF

https://en.wikipedia.org/wiki/UTF-16
        """
        if not self.first_half_surrogate_pair_detected_16be:
            if 0xD8 <= pair[0] <= 0xDB:
                self.first_half_surrogate_pair_detected_16be = True
            elif 0xDC <= pair[0] <= 0xDF:
                self.invalid_utf16be = True
        else:
            if 0xDC <= pair[0] <= 0xDF:
                self.first_half_surrogate_pair_detected_16be = False
            else:
                self.invalid_utf16be = True

if not self.first_half_surrogate_pair_detected_16le:
            if 0xD8 <= pair[1] <= 0xDB:
                self.first_half_surrogate_pair_detected_16le = True
            elif 0xDC <= pair[1] <= 0xDF:
                self.invalid_utf16le = True
        else:
            if 0xDC <= pair[1] <= 0xDF:
                self.first_half_surrogate_pair_detected_16le = False
            else:
                self.invalid_utf16le = True

def feed(self, byte_str):
        for c in byte_str:
            mod4 = self.position % 4
            self.quad[mod4] = c
            if mod4 == 3:
                self.validate_utf32_characters(self.quad)
                self.validate_utf16_characters(self.quad[0:2])
                self.validate_utf16_characters(self.quad[2:4])
            if c == 0:
                self.zeros_at_mod[mod4] += 1
            else:
                self.nonzeros_at_mod[mod4] += 1
            self.position += 1
        return self.state

@property
    def state(self):
        if self._state in {ProbingState.NOT_ME, ProbingState.FOUND_IT}:
            # terminal, decided states
            return self._state
        if self.get_confidence() > 0.80:
            self._state = ProbingState.FOUND_IT
        elif self.position > 4 * 1024:
            # if we get to 4kb into the file, and we can't conclude it's UTF,
            # let's give up
            self._state = ProbingState.NOT_ME
        return self._state

def get_confidence(self):
        return (
            0.85
            if (
                self.is_likely_utf16le()
                or self.is_likely_utf16be()
                or self.is_likely_utf32le()
                or self.is_likely_utf32be()
            )
            else 0.00
        )