Use msgpack to serialize raw entries into compact bytes

ilius · Jan 2, 2025 · 68693fd
1 parent e10bf9c
commit 68693fd
Show file tree

Hide file tree

Showing 4 changed files with 12 additions and 8 deletions.
diff --git a/pyglossary/entry.py b/pyglossary/entry.py
@@ -12,6 +12,8 @@
 )
 from typing import TYPE_CHECKING
 
+from msgpack import loads
+
 from .entry_base import BaseEntry, MultiStr
 from .iter_utils import unique_everseen
 from .text_utils import joinByBar
@@ -202,7 +204,7 @@ def getRawEntrySortKey(
 	) -> Callable[[RawEntryType], Any]:
 		def newKey(x: RawEntryType) -> Any:  # noqa: ANN401
 			# x is rawEntry, so x[2:] is list[bytes]: list of words in bytes
-			return key([b.decode("utf-8") for b in x[2:]])  # type: ignore
+			return key([b.decode("utf-8") for b in loads(x)[2:]])  # type: ignore
 
 		return newKey
 

diff --git a/pyglossary/glossary_types.py b/pyglossary/glossary_types.py
@@ -4,7 +4,6 @@
 from collections.abc import (
 	Callable,
 	Iterator,
-	Sequence,
 )
 
 # -*- coding: utf-8 -*-
@@ -33,7 +32,7 @@
 # str(rawEntry[0]): defiFormat or ""
 # rawEntry[1]: b_defi
 # rawEntry[2:]: b_word_list
-RawEntryType: TypeAlias = Sequence[bytes]
+RawEntryType: TypeAlias = bytes
 
 
 class EntryType(typing.Protocol):  # noqa: PLR0904

diff --git a/pyglossary/glossary_v2.py b/pyglossary/glossary_v2.py
@@ -34,6 +34,8 @@
 from typing import TYPE_CHECKING, cast
 from uuid import uuid1
 
+from msgpack import dumps, loads
+
 from . import core
 from .core import (
 	cacheDir,
@@ -243,7 +245,7 @@ def _dataEntryToRaw(self, entry: DataEntry) -> RawEntryType:
 		b_fpath = b""
 		if self.tmpDataDir:
 			b_fpath = entry.save(self.tmpDataDir).encode("utf-8")
-		return (b"b", b_fpath, entry.getFileName().encode("utf-8"))
+		return dumps([b"b", b_fpath, entry.getFileName().encode("utf-8")])
 
 	def _entryToRaw(self, entry: EntryType) -> RawEntryType:
 		"""
@@ -257,9 +259,10 @@ def _entryToRaw(self, entry: EntryType) -> RawEntryType:
 		if defiFormat is None or defiFormat == self._defaultDefiFormat:
 			defiFormat = ""
 
-		return [defiFormat.encode("ascii"), entry.b_defi] + entry.lb_word
+		return dumps([defiFormat.encode("ascii"), entry.b_defi] + entry.lb_word)
 
-	def _entryFromRaw(self, rawEntry: RawEntryType) -> EntryType:
+	def _entryFromRaw(self, rawEntrBytes: RawEntryType) -> EntryType:
+		rawEntry = loads(rawEntrBytes)
 		defiFormat = rawEntry[0].decode("ascii") or self._defaultDefiFormat
 		defi = rawEntry[1].decode("utf-8")
 

diff --git a/pyglossary/sq_entry_list.py b/pyglossary/sq_entry_list.py
@@ -111,10 +111,10 @@ def __len__(self) -> int:
 		return self._len
 
 	def _encode(self, entry: EntryType) -> bytes:
-		return b"\x00".join(self._entryToRaw(entry))
+		return self._entryToRaw(entry)
 
 	def _decode(self, data: bytes) -> EntryType:
-		return self._entryFromRaw(data.split(b"\x00"))
+		return self._entryFromRaw(data)
 
 	def append(self, entry: EntryType) -> None:
 		self._cur.execute(  # type: ignore