Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Alternative Huffman Coding Implementation Using Priority Queue and Modular Design #12422

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
146 changes: 77 additions & 69 deletions compression/huffman.py
Original file line number Diff line number Diff line change
@@ -1,92 +1,100 @@
from __future__ import annotations

import heapq
import sys
from collections import defaultdict


class Letter:
    """
    A single character of the input together with its frequency.

    ``bitstring`` starts out empty; it is a mapping that other code can fill
    with the bit pattern assigned to ``letter``.
    """

    def __init__(self, letter: str, freq: int):
        self.letter: str = letter
        self.freq: int = freq
        self.bitstring: dict[str, str] = {}

    def __repr__(self) -> str:
        return f"{self.letter}:{self.freq}"
class HuffmanNode:
    """
    A node of the Huffman tree.

    A node created with a character is a leaf; a node created without one
    (``char`` is ``None``) is meant to be an internal node whose ``left`` and
    ``right`` children are attached after construction.

    >>> HuffmanNode("a", 2) < HuffmanNode("b", 3)
    True
    >>> HuffmanNode("a", 2)
    a:2
    """

    def __init__(self, char: str | None = None, freq: int = 0) -> None:
        self.char = char
        self.freq = freq
        self.left: HuffmanNode | None = None
        self.right: HuffmanNode | None = None

    def __repr__(self) -> str:
        # Uses ``char`` (this class has no ``letter`` attribute).
        return f"{self.char}:{self.freq}"

    def __lt__(self, other: HuffmanNode) -> bool:
        # heapq orders items with "<", so comparing by frequency makes a heap
        # of nodes pop the least frequent node first.
        return self.freq < other.freq


class TreeNode:
    """
    An internal tree node holding a frequency and its two children.

    Each child is either a ``Letter`` or another ``TreeNode``.
    """

    def __init__(
        self, freq: int, left: Letter | TreeNode, right: Letter | TreeNode
    ) -> None:
        self.freq: int = freq
        self.left: Letter | TreeNode = left
        self.right: Letter | TreeNode = right
def calculate_frequencies(file_path: str) -> defaultdict[str, int]:
    """
    Read the file at ``file_path`` and count how often each character occurs.

    The file is opened in text mode and read line by line, so line-break
    characters are counted like any other character.

    Returns a ``defaultdict`` mapping each character to its number of
    occurrences; an empty file yields an empty mapping.
    """
    freq: defaultdict[str, int] = defaultdict(int)
    # Text read mode is the default, so no explicit mode argument is needed.
    with open(file_path) as file:
        for line in file:
            for char in line:
                freq[char] += 1
    return freq


def parse_file(file_path: str) -> list[Letter]:
def build_huffman_tree(freq_dict: dict[str, int]) -> HuffmanNode | None:
    """
    Build the Huffman tree using a priority queue.

    ``freq_dict`` maps each character to its frequency. One leaf node is
    created per entry; the two lowest-frequency nodes are then repeatedly
    popped and merged under a new parent until a single node remains.

    Returns the root node, or ``None`` when ``freq_dict`` is empty (for
    example, for an empty input file) instead of failing with ``IndexError``.
    """
    if not freq_dict:
        return None

    priority_queue = [HuffmanNode(char, freq) for char, freq in freq_dict.items()]
    heapq.heapify(priority_queue)

    while len(priority_queue) > 1:
        left = heapq.heappop(priority_queue)
        right = heapq.heappop(priority_queue)

        # The parent carries no character, only the combined frequency.
        merged = HuffmanNode(freq=left.freq + right.freq)
        merged.left = left
        merged.right = right

        heapq.heappush(priority_queue, merged)

    return priority_queue[0]


def generate_codes(
    node: HuffmanNode | None,
    current_code: str = "",
    code_map: dict[str, str] | None = None,
) -> dict[str, str]:
    """
    Generate the Huffman codes by traversing the tree recursively.

    ``node`` is the (sub)tree to walk, ``current_code`` is the bit string
    accumulated on the way down to it, and ``code_map`` is the mapping being
    filled in (a fresh one is created when omitted). Going left appends "0",
    going right appends "1".

    Returns ``code_map``, mapping each character to its bit string. A ``None``
    node contributes nothing, so ``generate_codes(None)`` returns ``{}``.
    """
    if code_map is None:
        code_map = {}

    if node is None:
        return code_map

    if node.char is not None:
        # A leaf reached with an empty path is the root of a one-symbol tree;
        # give it "0" so that the encoded output is not an empty string.
        code_map[node.char] = current_code or "0"

    generate_codes(node.left, current_code + "0", code_map)
    generate_codes(node.right, current_code + "1", code_map)

    return code_map


def encode_file(file_path: str, code_map: dict[str, str]) -> str:
    """
    Encode the file contents using the Huffman codes.

    Every character of the file at ``file_path`` is looked up in ``code_map``
    and the resulting bit strings are concatenated in file order.

    Returns the concatenated bit string (``""`` for an empty file).
    Raises ``KeyError`` if the file contains a character missing from
    ``code_map``.
    """
    # Text read mode is the default, so no explicit mode argument is needed.
    with open(file_path) as file:
        return "".join(code_map[char] for line in file for char in line)

def huffman(file_path: str) -> None:
    """
    Main function to perform Huffman encoding on a given file.

    Counts the character frequencies of the file, builds the Huffman tree,
    derives the code of every character, then prints the code table followed
    by the encoded contents of the file.
    """
    freq_dict = calculate_frequencies(file_path)
    huffman_tree_root = build_huffman_tree(freq_dict)
    code_map = generate_codes(huffman_tree_root)

    print(f"Huffman Codes for characters in {file_path}:")
    for char, code in code_map.items():
        # repr() keeps newlines, tabs and similar characters on one readable
        # line; ordinary characters still print as 'x'.
        print(f"{char!r}: {code}")

    encoded_data = encode_file(file_path, code_map)
    print("\nEncoded Data:")
    print(encoded_data)


if __name__ == "__main__":
    # Expect exactly one thing from the command line: the path of the file to
    # encode. Check for it before touching sys.argv[1], and call huffman once.
    if len(sys.argv) < 2:
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit("Usage: python huffman.py <file_path>")
    huffman(sys.argv[1])
Loading