Skip to content

Commit

Permalink
playwright ci changes
Browse files: browse the repository at this point in the history
  • Loading branch information
emcf committed Mar 24, 2024
1 parent 953dd1f commit 986a760
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 9 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/python-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ jobs:
with:
node-version: 18
- name: Install dependencies
run: npm ci
run: npm install
- name: Install Playwright Browsers
run: npx playwright install --with-deps
- name: Set up Python 3.10
Expand Down
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
__pycache__/
outputs/
logs/
logs/
node_modules/
8 changes: 1 addition & 7 deletions extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,31 +287,25 @@ def extract_docx(source_name: str) -> List[Chunk]:
# make new temp image directory
chunks = []
temp_image_dir = tempfile.mkdtemp()
print('processing')
text = docx2txt.process(source_name, temp_image_dir)
chunks.append(Chunk(path=source_name, text=text, image=None, source_type=SourceTypes.DOCX))
for image_name in os.listdir(temp_image_dir):
print(image_name)
image_path = os.path.join(temp_image_dir, image_name)
print('attempgin to open')
image = Image.open(image_path)
image.load() # needed to close the file
print("appending")
chunks.append(Chunk(path=source_name, text=None, image=image, source_type=SourceTypes.DOCX))
# if the temp dir exists, delete the images inside it and then the directory itself
print('attempting delete')
if os.path.exists(temp_image_dir):
for image_name in os.listdir(temp_image_dir):
image_path = os.path.join(temp_image_dir, image_name)
os.remove(image_path)
os.rmdir(temp_image_dir)
print('done')
return chunks

def extract_pptx(source_name: str) -> List[Chunk]:
prs = Presentation(source_name)
chunks = []
# parse slides, shapes, and images
# parse shapes inside slides
for slide in prs.slides:
slide_text = ""
slide_images = []
Expand Down
57 changes: 57 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

18 changes: 18 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
{
"name": "thepipe",
"version": "1.0.0",
"description": "The Pipe is a simple tool to automate information extraction for multimodal LLMs.",
"main": "index.js",
"directories": {
"test": "tests"
},
"scripts": {
"test": "echo \"Error: no test specified\" && exit 1"
},
"keywords": [],
"author": "",
"license": "ISC",
"dependencies": {
"playwright": "^1.42.1"
}
}

0 comments on commit 986a760

Please sign in to comment.