Skip to content

Commit

Permalink
use msgpack to compress raw entries
Browse files Browse the repository at this point in the history
  • Loading branch information
ilius committed Jan 2, 2025
1 parent e10bf9c commit 68693fd
Show file tree
Hide file tree
Showing 4 changed files with 12 additions and 8 deletions.
4 changes: 3 additions & 1 deletion pyglossary/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
)
from typing import TYPE_CHECKING

from msgpack import loads

from .entry_base import BaseEntry, MultiStr
from .iter_utils import unique_everseen
from .text_utils import joinByBar
Expand Down Expand Up @@ -202,7 +204,7 @@ def getRawEntrySortKey(
) -> Callable[[RawEntryType], Any]:
def newKey(x: RawEntryType) -> Any: # noqa: ANN401
# x is rawEntry, so x[2:] is list[bytes]: list of words in bytes
return key([b.decode("utf-8") for b in x[2:]]) # type: ignore
return key([b.decode("utf-8") for b in loads(x)[2:]]) # type: ignore

return newKey

Expand Down
3 changes: 1 addition & 2 deletions pyglossary/glossary_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
from collections.abc import (
Callable,
Iterator,
Sequence,
)

# -*- coding: utf-8 -*-
Expand Down Expand Up @@ -33,7 +32,7 @@
# str(rawEntry[0]): defiFormat or ""
# rawEntry[1]: b_defi
# rawEntry[2:]: b_word_list
RawEntryType: TypeAlias = Sequence[bytes]
RawEntryType: TypeAlias = bytes


class EntryType(typing.Protocol): # noqa: PLR0904
Expand Down
9 changes: 6 additions & 3 deletions pyglossary/glossary_v2.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@
from typing import TYPE_CHECKING, cast
from uuid import uuid1

from msgpack import dumps, loads

from . import core
from .core import (
cacheDir,
Expand Down Expand Up @@ -243,7 +245,7 @@ def _dataEntryToRaw(self, entry: DataEntry) -> RawEntryType:
b_fpath = b""
if self.tmpDataDir:
b_fpath = entry.save(self.tmpDataDir).encode("utf-8")
return (b"b", b_fpath, entry.getFileName().encode("utf-8"))
return dumps([b"b", b_fpath, entry.getFileName().encode("utf-8")])

def _entryToRaw(self, entry: EntryType) -> RawEntryType:
"""
Expand All @@ -257,9 +259,10 @@ def _entryToRaw(self, entry: EntryType) -> RawEntryType:
if defiFormat is None or defiFormat == self._defaultDefiFormat:
defiFormat = ""

return [defiFormat.encode("ascii"), entry.b_defi] + entry.lb_word
return dumps([defiFormat.encode("ascii"), entry.b_defi] + entry.lb_word)

def _entryFromRaw(self, rawEntry: RawEntryType) -> EntryType:
def _entryFromRaw(self, rawEntrBytes: RawEntryType) -> EntryType:
rawEntry = loads(rawEntrBytes)
defiFormat = rawEntry[0].decode("ascii") or self._defaultDefiFormat
defi = rawEntry[1].decode("utf-8")

Expand Down
4 changes: 2 additions & 2 deletions pyglossary/sq_entry_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,10 +111,10 @@ def __len__(self) -> int:
return self._len

def _encode(self, entry: EntryType) -> bytes:
return b"\x00".join(self._entryToRaw(entry))
return self._entryToRaw(entry)

def _decode(self, data: bytes) -> EntryType:
return self._entryFromRaw(data.split(b"\x00"))
return self._entryFromRaw(data)

def append(self, entry: EntryType) -> None:
self._cur.execute( # type: ignore
Expand Down

0 comments on commit 68693fd

Please sign in to comment.