Thanks to visit codestin.com
Credit goes to github.com

Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,10 @@ convert(
)
```

The `ConversionConfig` class accepts the same parameters as the command line arguments:
The `ConversionConfig` class accepts the following parameters:

- `pptx_path`: Path to the input PPTX file (required)
- `pptx`: A file-like object containing the PPTX data (required if `pptx_path` is not provided; not available via command line)
- `pptx_path`: Path to the input PPTX file (required if `pptx` is not provided or if using the command line)
- `output_path`: Path for the output markdown file (required)
- `image_dir`: Directory for extracted images (required)
- `title_path`: Path to custom titles file
Expand All @@ -165,7 +166,7 @@ The `ConversionConfig` class accepts the same parameters as the command line arg
- `page`: Convert only specified page number
- `keep_similar_titles`: Keep similar titles with "(cont.)" suffix


Note: Provide either `pptx_path` or `pptx`, not both. If both are provided, `pptx_path` takes precedence.

## Detailed Parse Rules

Expand Down
2 changes: 1 addition & 1 deletion pptx2md/entry.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ def convert(config: ConversionConfig):
if config.title_path:
config.custom_titles = prepare_titles(config.title_path)

prs = load_pptx(config.pptx_path)
prs = load_pptx(config)

logger.info("conversion started")

Expand Down
7 changes: 5 additions & 2 deletions pptx2md/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,8 +150,11 @@ def process_picture(config: ConversionConfig, shape, slide_idx) -> Union[ImageEl

global picture_count

file_prefix = ''.join(os.path.basename(config.pptx_path).split('.')[:-1])
pic_name = file_prefix + f'_{picture_count}'
if config.pptx_path is None:
pic_name = f'img_{picture_count}'
else:
file_prefix = ''.join(os.path.basename(config.pptx_path).split('.')[:-1])
pic_name = file_prefix + f'_{picture_count}'
pic_ext = shape.image.ext
if not os.path.exists(config.image_dir):
os.makedirs(config.image_dir)
Expand Down
31 changes: 28 additions & 3 deletions pptx2md/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,17 +14,27 @@

from __future__ import annotations

import logging
from enum import Enum
from pathlib import Path
from io import BytesIO, BufferedReader
from typing import List, Optional, Union

from pydantic import BaseModel
from pydantic import BaseModel, model_validator, ConfigDict

logger = logging.getLogger(__name__)


FileLikeType = Union[BytesIO, BufferedReader]


class ConversionConfig(BaseModel):
"""Configuration for PowerPoint to Markdown conversion."""

pptx_path: Path
pptx: Optional[FileLikeType] = None
"""File-like object of the pptx file to be converted"""

pptx_path: Optional[Path] = None
"""Path to the pptx file to be converted"""

output_path: Path
Expand Down Expand Up @@ -81,6 +91,19 @@ class ConversionConfig(BaseModel):
keep_similar_titles: bool = False
"""Keep similar titles (allow for repeated slide titles - One or more - Add (cont.) to the title)"""

@model_validator(mode="after")
def check_pptx_input(self):
if self.pptx is None and self.pptx_path is None:
raise ValueError("One of 'pptx' or 'pptx_path' must be supplied.")
elif self.pptx is not None and self.pptx_path is not None:
logger.warning(
"Both 'pptx' and 'pptx_path' are supplied. Using 'pptx_path' as the input file."
)
self.pptx = None
return self

model_config = ConfigDict(arbitrary_types_allowed=True)


class ElementType(str, Enum):
Title = "Title"
Expand Down Expand Up @@ -145,7 +168,9 @@ class TableElement(BaseElement):
content: List[List[List[TextRun]]] # rows -> cols -> rich text


SlideElement = Union[TitleElement, ListItemElement, ParagraphElement, ImageElement, TableElement]
SlideElement = Union[
TitleElement, ListItemElement, ParagraphElement, ImageElement, TableElement
]


class SlideType(str, Enum):
Expand Down
66 changes: 47 additions & 19 deletions pptx2md/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,68 +22,96 @@

from pptx import Presentation

from pptx2md.types import ConversionConfig, FileLikeType

logger = logging.getLogger(__name__)


def fix_null_rels(file_path):
temp_dir_name = tempfile.mkdtemp()
shutil.unpack_archive(file_path, temp_dir_name, 'zip')
shutil.unpack_archive(file_path, temp_dir_name, "zip")
rels = [
os.path.join(dp, f)
for dp, dn, filenames in os.walk(temp_dir_name)
for f in filenames
if os.path.splitext(f)[1] == '.rels'
if os.path.splitext(f)[1] == ".rels"
]
pat = re.compile(r'<\S*Relationship[^>]+Target\S*=\S*"NULL"[^>]*/>', re.I)
for fn in rels:
f = open(fn, 'r+')
f = open(fn, "r+")
content = f.read()
res = pat.search(content)
if res is not None:
content = pat.sub('', content)
content = pat.sub("", content)
f.seek(0)
f.truncate()
f.write(content)
f.close()
tfn = uuid.uuid4().hex
shutil.make_archive(tfn, 'zip', temp_dir_name)
shutil.make_archive(tfn, "zip", temp_dir_name)
shutil.rmtree(temp_dir_name)
tgt = f'{file_path[:-5]}_purged.pptx'
shutil.move(f'{tfn}.zip', tgt)
tgt = f"{file_path[:-5]}_purged.pptx"
shutil.move(f"{tfn}.zip", tgt)
return tgt


def load_pptx(file_path: str) -> Presentation:
def load_pptx_from_io(file_like: FileLikeType) -> Presentation:
"""Load a PowerPoint presentation from a file-like object."""
try:
file_like.seek(0)
prs = Presentation(file_like)
except Exception as err:
raise ValueError(
"Invalid file-like object. Please provide a valid PPTX file."
) from err
return prs


def load_pptx_from_path(file_path: Path) -> Presentation:
"""Load a PowerPoint presentation from a file path."""
if not os.path.exists(file_path):
logger.error(f'source file {file_path} not exist!')
logger.error(f'absolute path: {os.path.abspath(file_path)}')
logger.error(f"source file {file_path} not exist!")
logger.error(f"absolute path: {os.path.abspath(file_path)}")
raise FileNotFoundError(file_path)
try:
prs = Presentation(file_path)
prs = Presentation(str(file_path))
except KeyError as err:
if len(err.args) > 0 and re.match(r'There is no item named .*NULL.* in the archive', str(err.args[0])):
logger.info('corrupted links found, trying to purge...')
if len(err.args) > 0 and re.match(
r"There is no item named .*NULL.* in the archive", str(err.args[0])
):
logger.info("corrupted links found, trying to purge...")
try:
res_path = fix_null_rels(file_path)
logger.info(f'purged file saved to {res_path}.')
logger.info(f"purged file saved to {res_path}.")
prs = Presentation(res_path)
except:
logger.error(
'failed to purge corrupted links, you can report this at https://github.com/ssine/pptx2md/issues')
"failed to purge corrupted links, you can report this at https://github.com/ssine/pptx2md/issues"
)
raise err
else:
logger.error('unknown error, you can report this at https://github.com/ssine/pptx2md/issues')
logger.error(
"unknown error, you can report this at https://github.com/ssine/pptx2md/issues"
)
raise err
return prs


def load_pptx(config: ConversionConfig) -> Presentation:
"""Load a PowerPoint presentation from a file-like object or a file path."""
if config.pptx:
return load_pptx_from_io(config.pptx)
elif config.pptx_path:
return load_pptx_from_path(config.pptx_path)


def prepare_titles(title_path: Path) -> dict[str, int]:
titles: dict[str, int] = {}
with open(title_path, 'r', encoding='utf8') as f:
with open(title_path, "r", encoding="utf8") as f:
indent = -1
for line in f.readlines():
cnt = 0
while line[cnt] == ' ':
while line[cnt] == " ":
cnt += 1
if cnt == 0:
titles[line.strip()] = 1
Expand All @@ -98,4 +126,4 @@ def prepare_titles(title_path: Path) -> dict[str, int]:

def rgb_to_hex(rgb):
r, g, b = rgb
return f'#{r:02x}{g:02x}{b:02x}'
return f"#{r:02x}{g:02x}{b:02x}"