Module jpfreq.jp_frequency_list
Japanese Frequency List
This module contains the main class, JapaneseFrequencyList, which is the primary class you will be working with.
Expand source code
"""
.. include:: ../../documentation/jp_frequency_list.md
"""
from fugashi import Tagger
from typing import Callable
from os.path import isfile as file_exists
from .word_slot import WordSlot, get_unique_wordslots
from .text_info import TextInfo
from .kanji import all_kanji_in_string, Kanji
from .util import percent_of, in_range
from .word import Word, WordType
def word_validator_exclude_by_type(
    input_word: Word, excluded_word_types: list[WordType]
) -> bool:
    """
    Validates a word by rejecting it when any of its types is listed in `excluded_word_types`.

    Parameters
    ----------
    input_word : Word
        The word to validate.
    excluded_word_types : list[WordType]
        A list of word types to exclude.

    Returns
    -------
    bool
        True when none of the word's types are excluded, False otherwise.
    """
    # A word is invalid as soon as a single one of its types is excluded.
    return not any(
        word_type in excluded_word_types for word_type in input_word.types
    )
class JapaneseFrequencyList:
    """
    A class for storing the frequency of words in a Japanese text.
    """

    _unique_words: dict[str, WordSlot]
    _unique_kanji: dict[str, Kanji]
    _word_count: int
    _tagger: Tagger
    _word_validator: Callable[[Word], bool]
    compare_surface: bool

    # Default word types excluded from the frequency count.
    # NOTE: this is a shared class-level list; __init__ copies it into an
    # instance attribute so mutating one instance cannot affect others.
    excluded_word_types: list[WordType] = [
        WordType.PARTICLE,
        WordType.AUXILIARY_VERB,
        WordType.SUPPLEMENTARY_SYMBOL,
        WordType.BLANK_SPACE,
        WordType.NUMERAL,
    ]

    def __init__(
        self,
        text_to_analyse: list = None,
        tagger_instance=None,
        compare_surface: bool = False,
        excluded_word_types: list[WordType] = None,
    ):
        self._unique_words = {}
        self._unique_kanji = {}
        self._word_count = 0
        self.compare_surface = compare_surface

        # Defensive copy: avoid sharing the mutable class-level default (or the
        # caller's list) so later mutation of one cannot leak into the other.
        source_types = (
            excluded_word_types
            if excluded_word_types is not None
            else type(self).excluded_word_types
        )
        self.excluded_word_types = list(source_types)

        self._tagger = tagger_instance
        if not self._tagger:
            self._tagger = Tagger("-Owakati")
        elif not isinstance(self._tagger, Tagger):
            raise TypeError(
                f"JapaneseFrequencyList: tagger_instance must be of type fugashi.Tagger, not {type(self._tagger)}"
            )

        if text_to_analyse is not None:
            self.process_texts(text_to_analyse)

    def __len__(self) -> int:
        """
        The number of unique words in the frequency list.

        Returns
        -------
        int
            The number of unique words in the frequency list.
        """
        return len(self.wordslots)

    def __repr__(self) -> str:  # pragma: no cover
        """
        A string representation of the frequency list in the form:
        'JapaneseFrequencyList(text_info=TextInfo(...))'
        """
        text_info = self.generate_text_info()
        return f"JapaneseFrequencyList(\ntext_info={text_info!r}\n)"

    def __contains__(self, word: str) -> bool:
        """
        Whether the representation of a word is in the frequency list.
        This will compare against surfaces only if `compare_surface` is True.

        Parameters
        ----------
        word : str
            The word to check for.

        Returns
        -------
        bool
            Whether the word is in the frequency list.
        """
        contains = word in self._unique_words
        if self.compare_surface and not contains:
            # Fall back to a linear scan over the surfaces held in each slot.
            contains = any(word in word_slot for word_slot in self.wordslots)
        return contains

    def __getitem__(self, word: str) -> WordSlot:
        """
        Returns the WordSlot for the specified word.
        This will search using the word's representation, not its surface.
        This means that if you pass "これ", it will fail, as the representation is "此れ".
        Please see the function `get_representation` in this class to get the representation of a word.

        Parameters
        ----------
        word : str
            The representation of the word to look up.

        Returns
        -------
        WordSlot
            The slot holding every occurrence of the word.

        Raises
        ------
        KeyError
            If the representation is not present in the frequency list.
        """
        if word not in self._unique_words:
            raise KeyError(f"Word '{word}' not found in frequency list")
        return self._unique_words[word]

    @property
    def wordslots(self) -> list[WordSlot]:
        """
        Returns a list of all the wordslots.

        Returns
        -------
        list[WordSlot]
            A list of all the wordslots.
        """
        return list(self._unique_words.values())

    @property
    def word_count(self) -> int:
        """
        Returns the number of words.

        Returns
        -------
        int
            The number of words.
        """
        return self._word_count

    @property
    def unique_words(self) -> int:
        """
        Returns the number of unique words.

        Returns
        -------
        int
            The number of unique words.
        """
        return len(self.wordslots)

    @property
    def unique_words_used_once(self) -> int:
        """
        Returns the number of unique words used once.

        Returns
        -------
        int
            The number of unique words used once.
        """
        return len(get_unique_wordslots(self.wordslots))

    @property
    def unique_words_all(self) -> tuple[int, int, float]:
        """
        Returns the number of unique words, the number of unique words used once,
        and the percentage of unique words used once.

        Returns
        -------
        tuple[int, int, float]
            unique_words, unique_words_used_once, unique_words_used_once_percentage
        """
        unique_words: int = self.unique_words
        unique_words_used_once: int = self.unique_words_used_once
        unique_word_percentage: float = percent_of(unique_words_used_once, unique_words)
        return unique_words, unique_words_used_once, unique_word_percentage

    @property
    def unique_kanji(self) -> int:
        """
        Returns the number of unique kanji.

        Returns
        -------
        int
            The number of unique kanji.
        """
        return len(self._unique_kanji)

    @property
    def unique_kanji_used_once(self) -> int:
        """
        Returns the number of unique kanji used once.

        Returns
        -------
        int
            The number of unique kanji used once.
        """
        return sum(
            1 for kanji in self._unique_kanji.values() if kanji.frequency == 1
        )

    @property
    def unique_kanji_all(self) -> tuple[int, int, float]:
        """
        Returns the number of unique kanji, the number of unique kanji used once,
        and the percentage of unique kanji used once.

        Returns
        -------
        tuple[int, int, float]
            unique_kanji, unique_kanji_used_once, unique_kanji_used_once_percentage
        """
        unique_kanji: int = self.unique_kanji
        unique_kanji_used_once: int = self.unique_kanji_used_once
        unique_kanji_percentage: float = percent_of(
            unique_kanji_used_once, unique_kanji
        )
        return unique_kanji, unique_kanji_used_once, unique_kanji_percentage

    def clear(self) -> None:
        """
        Clears the frequency list of all words and kanji, reverting it to its initial state.
        """
        self._word_count = 0
        self._unique_words.clear()
        self._unique_kanji.clear()

    def get_most_frequent(
        self, limit: int = 100, minimum: int = -1, maximum: int = -1
    ) -> list[WordSlot]:
        """
        Returns a list of the most frequent words in the text with the specified limit.
        If limit is -1, then all words are returned.

        Parameters
        ----------
        limit : int
            The number of words to return.
        minimum : int
            The minimum frequency of the words to return (inclusive). -1 means no minimum.
        maximum : int
            The maximum frequency of the words to return (inclusive). -1 means no maximum.

        Returns
        -------
        list[WordSlot]
            A list of the most frequent words in the text with the specified limit, sorted by frequency.
        """
        item_array: list[WordSlot] = sorted(
            self.wordslots, key=lambda x: x.frequency, reverse=True
        )
        if minimum != -1 or maximum != -1:
            item_array = [
                item
                for item in item_array
                if in_range(item.frequency, minimum, maximum)
            ]
        if limit == -1 or limit > len(item_array):
            return item_array
        return item_array[:limit]

    def generate_text_info(self) -> TextInfo:
        """
        Generates a TextInfo object from the frequency list.

        Returns
        -------
        TextInfo
            A TextInfo object containing information about the text.
        """
        (
            unique_words,
            unique_words_used_once,
            unique_word_percentage,
        ) = self.unique_words_all
        (
            unique_kanji,
            unique_kanji_used_once,
            unique_kanji_percentage,
        ) = self.unique_kanji_all
        return TextInfo(
            self.word_count,
            unique_words,
            unique_words_used_once,
            unique_word_percentage,
            unique_kanji,
            unique_kanji_used_once,
            unique_kanji_percentage,
        )

    def add_kanji(self, kanji: Kanji) -> None:
        """
        Adds a kanji to the frequency list.

        Parameters
        ----------
        kanji : Kanji
            The kanji to add.
        """
        if kanji.representation in self._unique_kanji:
            self._unique_kanji[kanji.representation].frequency += 1
            return
        self._unique_kanji[kanji.representation] = kanji

    def validate_word(self, word: Word) -> bool:
        """
        Validates a word, checking if it should be excluded or not.
        This currently checks if the word is in the list of excluded types and removes it if it is.

        Parameters
        ----------
        word : Word
            The word to validate.

        Returns
        -------
        bool
            Whether the word is valid or not.
        """
        return word_validator_exclude_by_type(word, self.excluded_word_types)

    def add_word(self, word: Word) -> None:
        """
        Adds a word to the frequency list.
        If the word is already in the list, then the frequency is increased by 1.
        Otherwise, the word is added to the list with a frequency of 1.
        Note: This method assumes the word is valid.

        Parameters
        ----------
        word : Word
            The word to add.
        """
        self._word_count += 1
        if word.representation in self._unique_words:
            self._unique_words[word.representation].add_word(word)
            return
        # if there is no representation of this word then we must add one
        self._unique_words[word.representation] = WordSlot([word])

    def get_representation(self, word: str) -> str:
        """
        Returns the representation of a word.
        This is the word without any inflections.
        For example, the representation of "これ" is "此れ".
        The representation of "行った" is "行く".

        Parameters
        ----------
        word : str
            The word to get the representation of.

        Returns
        -------
        str
            The representation of the word.

        Raises
        ------
        ValueError
            If the tagger produces no words for the input.
        """
        processed_word = self.parse_line(word)[0]
        if not processed_word:
            raise ValueError(f"Word '{word}' is not a valid word")
        return processed_word[0].representation

    def parse_line(self, line: str) -> tuple[list[Word], list[Kanji]]:
        """
        Parses a line of text into a list of Words and a list of Kanji.
        Backbone of all parsing.

        Parameters
        ----------
        line : str
            The line to parse.

        Returns
        -------
        tuple[list[Word], list[Kanji]]
            A tuple containing the list of Words and the list of Kanji.
        """
        words = self._tagger(line)
        return [Word.from_node(word) for word in words], all_kanji_in_string(line)

    def process_line(self, line_to_process: str) -> None:
        """
        Parses a line, adding the valid words and all kanji to the frequency list.
        All other processing functions boil down to this.

        Parameters
        ----------
        line_to_process : str
            The line to process.
        """
        line_to_process = line_to_process.replace("\n", "").strip()
        words, kanji_list = self.parse_line(line_to_process)
        for kanji in kanji_list:
            self.add_kanji(kanji)
        for word in words:
            if self.validate_word(word):
                self.add_word(word)

    def process_text(self, text_to_process: str) -> None:
        """
        Parses a string split by the newline character, adding the valid words to the frequency list.

        Parameters
        ----------
        text_to_process : str
            Text potentially containing multiple lines.
        """
        for line in text_to_process.split("\n"):
            self.process_line(line)

    def process_texts(self, texts_to_process: list) -> None:
        """
        Parses a list of texts, adding the valid words to the frequency list.

        Parameters
        ----------
        texts_to_process : list
            A list of texts to process.
        """
        for text in texts_to_process:
            self.process_text(text)

    def process_file(self, file_path: str) -> None:
        """
        Parses a file, adding the valid words to the frequency list.

        Parameters
        ----------
        file_path : str
            The path to the file to process.

        Raises
        ------
        FileExistsError
            If the path does not exist.
            NOTE(review): FileNotFoundError would be the idiomatic exception for a
            missing file; FileExistsError is kept so existing callers still catch it.
        """
        if not file_exists(file_path):
            raise FileExistsError(
                f"process_file: File path passed doesn't exist ({file_path})"
            )
        with open(file_path, "r", encoding="utf-8") as fs:
            for line in fs:
                self.process_line(line)
Functions
def word_validator_exclude_by_type(input_word: Word, excluded_word_types: list[WordType]) ‑> bool
-
Validates a word by excluding it if it is of a certain type listed in
excluded_word_types
.Parameters
input_word
:Word
- The word to validate.
excluded_word_types
:list[WordType]
- A list of word types to exclude.
Returns
bool
- Whether the word is valid or not.
Expand source code
def word_validator_exclude_by_type( input_word: Word, excluded_word_types: list[WordType] ) -> bool: """ Validates a word by excluding it if it is of a certain type lists in `excluded_word_types`. Parameters ---------- input_word : UnidicNode The word to validate. excluded_word_types : list[str] A list of word types to exclude. Defaults to `EXCLUDED_WORD_TYPES`. Returns ------- bool Whether the word is valid or not. """ for word_type in input_word.types: if word_type in excluded_word_types: return False return True
Classes
class JapaneseFrequencyList (text_to_analyse: list = None, tagger_instance=None, compare_surface: bool = False, excluded_word_types: list[WordType] = None)
-
A class for storing the frequency of words in a Japanese text.
Expand source code
class JapaneseFrequencyList: """ A class for storing the frequency of words in a Japanese text. """ _unique_words: dict[str, WordSlot] _unique_kanji: dict[str, Kanji] _word_count: int _tagger: Tagger _word_validator: Callable[[Word], bool] compare_surface: bool excluded_word_types: list[WordType] = [ WordType.PARTICLE, WordType.AUXILIARY_VERB, WordType.SUPPLEMENTARY_SYMBOL, WordType.BLANK_SPACE, WordType.NUMERAL, ] def __init__( self, text_to_analyse: list = None, tagger_instance=None, compare_surface: bool = False, excluded_word_types: list[WordType] = None, ): self._unique_words = {} self._unique_kanji = {} self._word_count = 0 self.compare_surface = compare_surface if excluded_word_types is not None: self.excluded_word_types = excluded_word_types self._tagger = tagger_instance if not self._tagger: self._tagger = Tagger("-Owakati") elif not isinstance(self._tagger, Tagger): raise TypeError( f"JapaneseFrequencyList: tagger_instance must be of type fugashi.Tagger, not {type(self._tagger)}" ) if text_to_analyse is not None: self.process_texts(text_to_analyse) def __len__(self) -> int: """ The number of unique words in the frequency list. Returns ------- int The number of unique words in the frequency list. """ return len(self.wordslots) def __repr__(self) -> str: # pragma: no cover """ A string representation of the frequency list in the form: 'JapaneseFrequencyList(text_info=TextInfo(...))' Returns ------- """ text_info = self.generate_text_info() return f"JapaneseFrequencyList(\ntext_info={text_info!r}\n)" def __contains__(self, word: str) -> bool: """ Whether the representation of a word is in the frequency list. This will compare against surfaces only if `compare_surface` is True. Parameters ---------- word : str The word to check for. Returns ------- bool Whether the word is in the frequency list. 
""" contains = word in self._unique_words if self.compare_surface and not contains: contains = any(word in word_slot for word_slot in self.wordslots) return contains def __getitem__(self, word: str) -> WordSlot: """ Returns the WordSlot for the specified word. This will search using the word's representation, not its surface. This means that if you pass "これ", it will fail, as the representation is "此れ". Please see the function `get_representation` in this class to get the representation of a word. Parameters ---------- word Returns ------- """ if word not in self._unique_words.keys(): raise KeyError(f"Word '{word}' not found in frequency list") return self._unique_words[word] @property def wordslots(self) -> list[WordSlot]: """ Returns a list of all the wordslots. Returns ------- list[WordSlot] A list of all the wordslots. """ return list(self._unique_words.values()) @property def word_count(self) -> int: """ Returns the number of words. Returns ------- int The number of words. """ return self._word_count @property def unique_words(self) -> int: """ Returns the number of unique words. Returns ------- int The number of unique words. """ return len(self.wordslots) @property def unique_words_used_once(self) -> int: """ Returns the number of unique words used once. Returns ------- int The number of unique words used once. """ return len(get_unique_wordslots(self.wordslots)) @property def unique_words_all(self) -> tuple[int, int, float]: """ Returns the number of unique words, the number of unique words used once, and the percentage of unique words used once. 
Returns ------- tuple[int, int, float] unique_words, unique_words_used_once, unique_words_used_once_percentage """ unique_words: int = self.unique_words unique_words_used_once: int = self.unique_words_used_once unique_word_percentage: float = percent_of(unique_words_used_once, unique_words) return unique_words, unique_words_used_once, unique_word_percentage @property def unique_kanji(self) -> int: """ Returns the number of unique kanji. Returns ------- int The number of unique kanji. """ return len(self._unique_kanji) @property def unique_kanji_used_once(self) -> int: """ Returns the number of unique kanji used once. Returns ------- int The number of unique kanji used once. """ return len( [kanji for kanji in self._unique_kanji.values() if kanji.frequency == 1] ) @property def unique_kanji_all(self) -> tuple[int, int, float]: """ Returns the number of unique kanji, the number of unique kanji used once, and the percentage of unique kanji used once. Returns ------- tuple[int, int, float] unique_kanji, unique_kanji_used_once, unique_kanji_used_once_percentage """ unique_kanji: int = self.unique_kanji unique_kanji_used_once: int = self.unique_kanji_used_once unique_kanji_percentage: float = percent_of( unique_kanji_used_once, unique_kanji ) return unique_kanji, unique_kanji_used_once, unique_kanji_percentage def clear(self) -> None: """ Clears the frequency list of all words and kanji, reverting it to its initial state. """ self._word_count = 0 self._unique_words.clear() self._unique_kanji.clear() def get_most_frequent( self, limit: int = 100, minimum: int = -1, maximum: int = -1 ) -> list[WordSlot]: """ Returns a list of the most frequent words in the text with the specified limit. If limit is -1, then all words are returned. Parameters ---------- limit : int The number of words to return. minimum : int The minimum frequency of the words to return (inclusive). -1 means no minimum. maximum : int The maximum frequency of the words to return (inclusive). 
-1 means no maximum. Returns ------- list[WordSlot] A list of the most frequent words in the text with the specified limit, sorted by frequency. """ item_array: list[WordSlot] = sorted( self.wordslots, key=lambda x: x.frequency, reverse=True ) if minimum != -1 or maximum != -1: item_array = [ item for item in item_array if in_range(item.frequency, minimum, maximum) ] if limit == -1 or limit > len(item_array): return item_array return item_array[:limit] def generate_text_info(self) -> TextInfo: """ Generates a TextInfo object from the frequency list. Returns ------- TextInfo A TextInfo object containing information about the text. """ ( unique_words, unique_words_used_once, unique_word_percentage, ) = self.unique_words_all ( unique_kanji, unique_kanji_used_once, unique_kanji_percentage, ) = self.unique_kanji_all return TextInfo( self.word_count, unique_words, unique_words_used_once, unique_word_percentage, unique_kanji, unique_kanji_used_once, unique_kanji_percentage, ) def add_kanji(self, kanji: Kanji) -> None: """ Adds a kanji to the frequency list. Parameters ---------- kanji : Kanji The kanji to add. """ if kanji.representation in self._unique_kanji: self._unique_kanji[kanji.representation].frequency += 1 return self._unique_kanji[kanji.representation] = kanji def validate_word(self, word: Word) -> bool: """ Validates a word, checking if it should be excluded or not. This currently checks if the word is in the list of excluded types and removes it if it is. Parameters ---------- word : Word The word to validate. Returns ------- bool Whether the word is valid or not. """ return word_validator_exclude_by_type(word, self.excluded_word_types) def add_word(self, word: Word) -> None: """ Adds a word to the frequency list. If the word is already in the list, then the frequency is increased by 1. Otherwise, the word is added to the list with a frequency of 1. Note: This method assumes the word is valid. Parameters ---------- word : Word The word to add. 
""" self._word_count += 1 if word.representation in self._unique_words.keys(): self._unique_words[word.representation].add_word(word) return # if there is no representation of this word then we must add one self._unique_words[word.representation] = WordSlot([word]) def get_representation(self, word: str) -> str: """ Returns the representation of a word. This is the word without any inflections. For example, the representation of "これ" is "此れ". The representation of "行った" is "行く". Parameters ---------- word : str The word to get the representation of. Returns ------- str The representation of the word. """ processed_word = self.parse_line(word)[0] if len(processed_word) < 1: raise ValueError(f"Word '{word}' is not a valid word") return processed_word[0].representation def parse_line(self, line: str) -> tuple[list[Word], list[Kanji]]: """ Parses a line of text into a list of Words and a list of Kanji. Backbone of all parsing. Parameters ---------- line : str The line to parse. Returns ------- tuple[list[Word], list[Kanji]] A tuple containing the list of Words and the list of Kanji. """ words = self._tagger(line) return [Word.from_node(word) for word in words], all_kanji_in_string(line) def process_line(self, line_to_process: str) -> None: """ Parses a line, adding the valid words and all kanji to the frequency list. All other processing functions boil down to this. Parameters ---------- line_to_process : str The line to process. """ line_to_process = line_to_process.replace("\n", "").strip() words, kanji = self.parse_line(line_to_process) [self.add_kanji(kanji) for kanji in kanji] [self.add_word(word) for word in words if self.validate_word(word)] def process_text(self, text_to_process: str) -> None: """ Parses a string split by the newline character, adding the valid words to the frequency list. Parameters ---------- text_to_process : str Text potentially containing multiple lines. 
""" [self.process_line(line) for line in text_to_process.split("\n")] def process_texts(self, texts_to_process: list) -> None: """ Parses a list of texts, adding the valid words to the frequency list. Parameters ---------- texts_to_process : list A list of texts to process. """ [self.process_text(text) for text in texts_to_process] def process_file(self, file_path: str) -> None: """ Parses a file, adding the valid words to the frequency list. Parameters ---------- file_path : str The path to the file to process. """ if not file_exists(file_path): raise FileExistsError( f"process_file: File path passed doesn't exist ({file_path})" ) with open(file_path, "r", encoding="utf-8") as fs: [self.process_line(line) for line in fs]
Class variables
var compare_surface : bool
var excluded_word_types : list[WordType]
Instance variables
var unique_kanji : int
-
Returns the number of unique kanji. Returns
int
- The number of unique kanji.
Expand source code
@property def unique_kanji(self) -> int: """ Returns the number of unique kanji. Returns ------- int The number of unique kanji. """ return len(self._unique_kanji)
var unique_kanji_all : tuple[int, int, float]
-
Returns the number of unique kanji, the number of unique kanji used once, and the percentage of unique kanji used once. Returns
tuple[int, int, float]
- unique_kanji, unique_kanji_used_once, unique_kanji_used_once_percentage
Expand source code
@property def unique_kanji_all(self) -> tuple[int, int, float]: """ Returns the number of unique kanji, the number of unique kanji used once, and the percentage of unique kanji used once. Returns ------- tuple[int, int, float] unique_kanji, unique_kanji_used_once, unique_kanji_used_once_percentage """ unique_kanji: int = self.unique_kanji unique_kanji_used_once: int = self.unique_kanji_used_once unique_kanji_percentage: float = percent_of( unique_kanji_used_once, unique_kanji ) return unique_kanji, unique_kanji_used_once, unique_kanji_percentage
var unique_kanji_used_once : int
-
Returns the number of unique kanji used once. Returns
int
- The number of unique kanji used once.
Expand source code
@property def unique_kanji_used_once(self) -> int: """ Returns the number of unique kanji used once. Returns ------- int The number of unique kanji used once. """ return len( [kanji for kanji in self._unique_kanji.values() if kanji.frequency == 1] )
var unique_words : int
-
Returns the number of unique words. Returns
int
- The number of unique words.
Expand source code
@property def unique_words(self) -> int: """ Returns the number of unique words. Returns ------- int The number of unique words. """ return len(self.wordslots)
var unique_words_all : tuple[int, int, float]
-
Returns the number of unique words, the number of unique words used once, and the percentage of unique words used once. Returns
tuple[int, int, float]
- unique_words, unique_words_used_once, unique_words_used_once_percentage
Expand source code
@property def unique_words_all(self) -> tuple[int, int, float]: """ Returns the number of unique words, the number of unique words used once, and the percentage of unique words used once. Returns ------- tuple[int, int, float] unique_words, unique_words_used_once, unique_words_used_once_percentage """ unique_words: int = self.unique_words unique_words_used_once: int = self.unique_words_used_once unique_word_percentage: float = percent_of(unique_words_used_once, unique_words) return unique_words, unique_words_used_once, unique_word_percentage
var unique_words_used_once : int
-
Returns the number of unique words used once. Returns
int
- The number of unique words used once.
Expand source code
@property def unique_words_used_once(self) -> int: """ Returns the number of unique words used once. Returns ------- int The number of unique words used once. """ return len(get_unique_wordslots(self.wordslots))
var word_count : int
-
Returns the number of words. Returns
int
- The number of words.
Expand source code
@property def word_count(self) -> int: """ Returns the number of words. Returns ------- int The number of words. """ return self._word_count
var wordslots : list[WordSlot]
-
Returns a list of all the wordslots. Returns
list[WordSlot]
- A list of all the wordslots.
Expand source code
@property def wordslots(self) -> list[WordSlot]: """ Returns a list of all the wordslots. Returns ------- list[WordSlot] A list of all the wordslots. """ return list(self._unique_words.values())
Methods
def add_kanji(self, kanji: Kanji) ‑> None
-
Adds a kanji to the frequency list. Parameters
kanji
:Kanji
- The kanji to add.
Expand source code
def add_kanji(self, kanji: Kanji) -> None: """ Adds a kanji to the frequency list. Parameters ---------- kanji : Kanji The kanji to add. """ if kanji.representation in self._unique_kanji: self._unique_kanji[kanji.representation].frequency += 1 return self._unique_kanji[kanji.representation] = kanji
def add_word(self, word: Word) ‑> None
-
Adds a word to the frequency list.
If the word is already in the list, then the frequency is increased by 1. Otherwise, the word is added to the list with a frequency of 1.
Note: This method assumes the word is valid.
Parameters
word
:Word
- The word to add.
Expand source code
def add_word(self, word: Word) -> None: """ Adds a word to the frequency list. If the word is already in the list, then the frequency is increased by 1. Otherwise, the word is added to the list with a frequency of 1. Note: This method assumes the word is valid. Parameters ---------- word : Word The word to add. """ self._word_count += 1 if word.representation in self._unique_words.keys(): self._unique_words[word.representation].add_word(word) return # if there is no representation of this word then we must add one self._unique_words[word.representation] = WordSlot([word])
def clear(self) ‑> None
-
Clears the frequency list of all words and kanji, reverting it to its initial state.
Expand source code
def clear(self) -> None: """ Clears the frequency list of all words and kanji, reverting it to its initial state. """ self._word_count = 0 self._unique_words.clear() self._unique_kanji.clear()
def generate_text_info(self) ‑> TextInfo
-
Generates a TextInfo object from the frequency list. Returns
TextInfo
- A TextInfo object containing information about the text.
Expand source code
def generate_text_info(self) -> TextInfo: """ Generates a TextInfo object from the frequency list. Returns ------- TextInfo A TextInfo object containing information about the text. """ ( unique_words, unique_words_used_once, unique_word_percentage, ) = self.unique_words_all ( unique_kanji, unique_kanji_used_once, unique_kanji_percentage, ) = self.unique_kanji_all return TextInfo( self.word_count, unique_words, unique_words_used_once, unique_word_percentage, unique_kanji, unique_kanji_used_once, unique_kanji_percentage, )
def get_most_frequent(self, limit: int = 100, minimum: int = -1, maximum: int = -1) ‑> list[WordSlot]
-
Returns a list of the most frequent words in the text with the specified limit. If limit is -1, then all words are returned. Parameters
limit
:int
- The number of words to return.
minimum
:int
- The minimum frequency of the words to return (inclusive). -1 means no minimum.
maximum
:int
- The maximum frequency of the words to return (inclusive). -1 means no maximum.
Returns
list[WordSlot]
- A list of the most frequent words in the text with the specified limit, sorted by frequency.
Expand source code
def get_most_frequent( self, limit: int = 100, minimum: int = -1, maximum: int = -1 ) -> list[WordSlot]: """ Returns a list of the most frequent words in the text with the specified limit. If limit is -1, then all words are returned. Parameters ---------- limit : int The number of words to return. minimum : int The minimum frequency of the words to return (inclusive). -1 means no minimum. maximum : int The maximum frequency of the words to return (inclusive). -1 means no maximum. Returns ------- list[WordSlot] A list of the most frequent words in the text with the specified limit, sorted by frequency. """ item_array: list[WordSlot] = sorted( self.wordslots, key=lambda x: x.frequency, reverse=True ) if minimum != -1 or maximum != -1: item_array = [ item for item in item_array if in_range(item.frequency, minimum, maximum) ] if limit == -1 or limit > len(item_array): return item_array return item_array[:limit]
def get_representation(self, word: str) ‑> str
-
Returns the representation of a word. This is the word without any inflections. For example, the representation of "これ" is "此れ". The representation of "行った" is "行く". Parameters
word
:str
- The word to get the representation of.
Returns
str
- The representation of the word.
Expand source code
def get_representation(self, word: str) -> str: """ Returns the representation of a word. This is the word without any inflections. For example, the representation of "これ" is "此れ". The representation of "行った" is "行く". Parameters ---------- word : str The word to get the representation of. Returns ------- str The representation of the word. """ processed_word = self.parse_line(word)[0] if len(processed_word) < 1: raise ValueError(f"Word '{word}' is not a valid word") return processed_word[0].representation
def parse_line(self, line: str) ‑> tuple[list[Word], list[Kanji]]
-
Parses a line of text into a list of Words and a list of Kanji. Backbone of all parsing.
Parameters
line
:str
- The line to parse.
Returns
tuple[list[Word], list[Kanji]]
- A tuple containing the list of Words and the list of Kanji.
Expand source code
def parse_line(self, line: str) -> tuple[list[Word], list[Kanji]]: """ Parses a line of text into a list of Words and a list of Kanji. Backbone of all parsing. Parameters ---------- line : str The line to parse. Returns ------- tuple[list[Word], list[Kanji]] A tuple containing the list of Words and the list of Kanji. """ words = self._tagger(line) return [Word.from_node(word) for word in words], all_kanji_in_string(line)
def process_file(self, file_path: str) ‑> None
-
Parses a file, adding the valid words to the frequency list. Parameters
file_path
:str
- The path to the file to process.
Expand source code
def process_file(self, file_path: str) -> None: """ Parses a file, adding the valid words to the frequency list. Parameters ---------- file_path : str The path to the file to process. """ if not file_exists(file_path): raise FileExistsError( f"process_file: File path passed doesn't exist ({file_path})" ) with open(file_path, "r", encoding="utf-8") as fs: [self.process_line(line) for line in fs]
def process_line(self, line_to_process: str) ‑> None
-
Parses a line, adding the valid words and all kanji to the frequency list. All other processing functions boil down to this.
Parameters
line_to_process
:str
- The line to process.
Expand source code
def process_line(self, line_to_process: str) -> None: """ Parses a line, adding the valid words and all kanji to the frequency list. All other processing functions boil down to this. Parameters ---------- line_to_process : str The line to process. """ line_to_process = line_to_process.replace("\n", "").strip() words, kanji = self.parse_line(line_to_process) [self.add_kanji(kanji) for kanji in kanji] [self.add_word(word) for word in words if self.validate_word(word)]
def process_text(self, text_to_process: str) ‑> None
-
Parses a string split by the newline character, adding the valid words to the frequency list.
Parameters
text_to_process
:str
- Text potentially containing multiple lines.
Expand source code
def process_text(self, text_to_process: str) -> None: """ Parses a string split by the newline character, adding the valid words to the frequency list. Parameters ---------- text_to_process : str Text potentially containing multiple lines. """ [self.process_line(line) for line in text_to_process.split("\n")]
def process_texts(self, texts_to_process: list) ‑> None
-
Parses a list of texts, adding the valid words to the frequency list.
Parameters
texts_to_process
:list
- A list of texts to process.
Expand source code
def process_texts(self, texts_to_process: list) -> None: """ Parses a list of texts, adding the valid words to the frequency list. Parameters ---------- texts_to_process : list A list of texts to process. """ [self.process_text(text) for text in texts_to_process]
def validate_word(self, word: Word) ‑> bool
-
Validates a word, checking if it should be excluded or not.
This currently checks if the word is in the list of excluded types and removes it if it is. Parameters
word
:Word
- The word to validate.
Returns
bool
- Whether the word is valid or not.
Expand source code
def validate_word(self, word: Word) -> bool: """ Validates a word, checking if it should be excluded or not. This currently checks if the word is in the list of excluded types and removes it if it is. Parameters ---------- word : Word The word to validate. Returns ------- bool Whether the word is valid or not. """ return word_validator_exclude_by_type(word, self.excluded_word_types)