Remove unused imports and fix trailing whitespace

This commit is contained in:
JoelShepard
2025-05-31 14:52:18 +02:00
parent 4d3b167c28
commit f78610e104

main.py

@@ -2,7 +2,7 @@
""" """
Kindle Clippings to Markdown Converter Kindle Clippings to Markdown Converter
This script parses a "My Clippings.txt" file from Kindle and converts it This script parses a "My Clippings.txt" file from Kindle and converts it
to structured markdown files, creating a separate file for each book. to structured markdown files, creating a separate file for each book.
Optionally uploads the results to a WebDAV server. Optionally uploads the results to a WebDAV server.
""" """
@@ -13,8 +13,6 @@ from pathlib import Path
import argparse
import requests
from requests.auth import HTTPBasicAuth
-import sys
-import os
from xml.etree import ElementTree as ET
from config import get_webdav_config
@@ -27,37 +25,37 @@ def clean_title(title):
def extract_page_info(metadata_line):
    """
    Extract page information from the metadata line.

    Args:
        metadata_line (str): The line containing page and position info

    Returns:
        str: Formatted page reference or empty string if not found
    """
    metadata_line = metadata_line.lower()

    # Look for page information
    page_match = re.search(r'pagina\s+(\d+)', metadata_line)
    if page_match:
        page_num = page_match.group(1)
        return f"(p. {page_num})"

    # If no page found, look for position as fallback
    position_match = re.search(r'posizione\s+(\d+)', metadata_line)
    if position_match:
        position_num = position_match.group(1)
        return f"(pos. {position_num})"

    return ""


def sanitize_filename(filename):
    """
    Sanitize filename by removing/replacing invalid characters while preserving spaces.

    Args:
        filename (str): Original filename

    Returns:
        str: Sanitized filename safe for filesystem with preserved spaces
    """
@@ -73,35 +71,35 @@ def sanitize_filename(filename):
        '?': '',  # Full-width question mark
        '*': '',  # Asterisk operator
    }

    # Apply character replacements
    for invalid_char, safe_char in replacements.items():
        filename = filename.replace(invalid_char, safe_char)

    # Clean up multiple spaces but preserve single spaces
    filename = re.sub(r'\s+', ' ', filename.strip())

    # Remove leading/trailing dots and spaces that could cause issues
    filename = filename.strip('. ')

    # Limit length to avoid filesystem issues (keeping some margin for .md extension)
    if len(filename) > 200:
        filename = filename[:200].strip()

    # Ensure filename is not empty and doesn't end with problematic characters
    if not filename or filename.endswith('.'):
        filename = filename.rstrip('.') or 'Untitled Book'

    return filename


def parse_clippings_file(file_path):
    """
    Parse the My Clippings.txt file and extract book titles, citations, and page references.

    Args:
        file_path (str): Path to the My Clippings.txt file

    Returns:
        dict: Dictionary with book titles as keys and lists of (citation, page_ref) tuples as values
    """
@@ -112,36 +110,36 @@ def parse_clippings_file(file_path):
        # Try with different encoding if UTF-8 fails
        with open(file_path, 'r', encoding='latin-1') as file:
            content = file.read()

    # Split entries by the delimiter
    entries = content.split('==========')

    # Dictionary to group citations by book title
    books = defaultdict(list)

    for entry in entries:
        entry = entry.strip()
        if not entry:
            continue

        # Split entry into lines and filter empty ones
        lines = [line.strip() for line in entry.split('\n') if line.strip()]

        if len(lines) < 2:
            continue

        # First line is the book title
        book_title = clean_title(lines[0])

        # Second line contains metadata (page, position, etc.)
        metadata_line = lines[1]
        page_ref = extract_page_info(metadata_line)

        # Third line onwards (if exists) contains the actual citation
        if len(lines) > 2:
            # Join all content lines (in case citation spans multiple lines)
            citation = ' '.join(lines[2:]).strip()

            # Only add non-empty citations
            if citation and citation != '':
                # Check if it's a bookmark entry (no actual text content)
@@ -149,43 +147,43 @@ def parse_clippings_file(file_path):
                if 'segnalibro' in meta_line and len(citation) < 10:
                    # Skip bookmark entries with very short or no text
                    continue

                books[book_title].append((citation, page_ref))

    return dict(books)


def generate_markdown_files(books_dict, output_dir="kindle_books"):
    """
    Generate separate markdown files for each book with page references.

    Args:
        books_dict (dict): Dictionary with book titles and (citation, page_ref) tuples
        output_dir (str): Directory to save markdown files

    Returns:
        list: List of generated file paths
    """
    # Create output directory if it doesn't exist
    output_path = Path(output_dir)
    output_path.mkdir(exist_ok=True)

    generated_files = []

    # Sort books alphabetically
    sorted_books = sorted(books_dict.items())

    for book_title, citations_with_pages in sorted_books:
        if not citations_with_pages:  # Skip books with no citations
            continue

        # Create sanitized filename
        safe_filename = sanitize_filename(book_title)
        file_path = output_path / f"{safe_filename}.md"

        # Generate markdown content for this book
        markdown_content = []

        # Add each citation as a blockquote with page reference
        for citation, page_ref in citations_with_pages:
            # Clean up citation text
@@ -196,30 +194,30 @@ def generate_markdown_files(books_dict, output_dir="kindle_books"):
markdown_content.append(f"> {citation} {page_ref}\n") markdown_content.append(f"> {citation} {page_ref}\n")
else: else:
markdown_content.append(f"> {citation}\n") markdown_content.append(f"> {citation}\n")
# Write to file # Write to file
with open(file_path, 'w', encoding='utf-8') as f: with open(file_path, 'w', encoding='utf-8') as f:
f.write('\n'.join(markdown_content)) f.write('\n'.join(markdown_content))
generated_files.append(str(file_path)) generated_files.append(str(file_path))
print(f"Generated: {file_path}") print(f"Generated: {file_path}")
total_citations = sum(len(citations) for citations in books_dict.values()) total_citations = sum(len(citations) for citations in books_dict.values())
print(f"\nTotal books processed: {len(generated_files)}") print(f"\nTotal books processed: {len(generated_files)}")
print(f"Total citations: {total_citations}") print(f"Total citations: {total_citations}")
print(f"Files saved in: {output_path.absolute()}") print(f"Files saved in: {output_path.absolute()}")
return generated_files return generated_files
def get_webdav_directory_contents(base_url, auth): def get_webdav_directory_contents(base_url, auth):
""" """
Get list of files in the WebDAV directory using PROPFIND request. Get list of files in the WebDAV directory using PROPFIND request.
Args: Args:
base_url (str): WebDAV base URL base_url (str): WebDAV base URL
auth: HTTPBasicAuth object auth: HTTPBasicAuth object
Returns: Returns:
list: List of file URLs found in the directory list: List of file URLs found in the directory
""" """
@@ -229,7 +227,7 @@ def get_webdav_directory_contents(base_url, auth):
            'Depth': '1',
            'Content-Type': 'application/xml'
        }

        # Basic PROPFIND body to get file names
        propfind_body = '''<?xml version="1.0" encoding="utf-8" ?>
<D:propfind xmlns:D="DAV:">
@@ -238,7 +236,7 @@ def get_webdav_directory_contents(base_url, auth):
    <D:resourcetype/>
  </D:prop>
</D:propfind>'''

        response = requests.request(
            'PROPFIND',
            base_url,
@@ -247,34 +245,34 @@ def get_webdav_directory_contents(base_url, auth):
            auth=auth,
            timeout=30
        )

        if response.status_code != 207:  # WebDAV Multi-Status
            print(f"Warning: Could not list directory contents - Status: {response.status_code}")
            return []

        # Parse the XML response
        root = ET.fromstring(response.text)
        file_urls = []

        # Define namespaces
        namespaces = {'D': 'DAV:'}

        for response_elem in root.findall('.//D:response', namespaces):
            href_elem = response_elem.find('D:href', namespaces)
            resourcetype_elem = response_elem.find('.//D:resourcetype', namespaces)

            if href_elem is not None and resourcetype_elem is not None:
                href = href_elem.text
                # Skip directories (they contain D:collection element)
                collection_elem = resourcetype_elem.find('D:collection', namespaces)

                if collection_elem is None and href:  # It's a file, not a directory
                    # Skip the base directory itself
                    if not href.endswith('/'):
                        file_urls.append(href)

        return file_urls

    except Exception as e:
        print(f"Error listing directory contents: {e}")
        return []
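As a sanity check on the parsing loop above, here is a minimal sketch of the 207 Multi-Status body it consumes, run through the same ElementTree logic. The hrefs are invented for illustration:

    from xml.etree import ElementTree as ET

    # Hypothetical 207 Multi-Status response; hrefs are made up
    sample = '''<?xml version="1.0" encoding="utf-8"?>
    <D:multistatus xmlns:D="DAV:">
      <D:response>
        <D:href>/dav/kindle/</D:href>
        <D:propstat><D:prop><D:resourcetype><D:collection/></D:resourcetype></D:prop></D:propstat>
      </D:response>
      <D:response>
        <D:href>/dav/kindle/Example Book.md</D:href>
        <D:propstat><D:prop><D:resourcetype/></D:prop></D:propstat>
      </D:response>
    </D:multistatus>'''

    root = ET.fromstring(sample)
    ns = {'D': 'DAV:'}
    for resp in root.findall('.//D:response', ns):
        href = resp.find('D:href', ns).text
        rtype = resp.find('.//D:resourcetype', ns)
        # A <D:collection/> child marks a directory; plain files lack it
        if rtype is not None and rtype.find('D:collection', ns) is None:
            print(href)  # prints only /dav/kindle/Example Book.md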
@@ -283,23 +281,23 @@ def get_webdav_directory_contents(base_url, auth):
def delete_webdav_file(file_url, auth):
    """
    Delete a single file from WebDAV server.

    Args:
        file_url (str): Full URL of the file to delete
        auth: HTTPBasicAuth object

    Returns:
        bool: True if successful, False otherwise
    """
    try:
        response = requests.delete(file_url, auth=auth, timeout=30)

        if response.status_code in [200, 204, 404]:  # 404 means already deleted
            return True
        else:
            print(f"Failed to delete {file_url} - Status: {response.status_code}")
            return False

    except Exception as e:
        print(f"Error deleting {file_url}: {e}")
        return False
@@ -308,7 +306,7 @@ def delete_webdav_file(file_url, auth):
def clear_webdav_directory():
    """
    Clear all files from the WebDAV directory.

    Returns:
        bool: True if successful, False otherwise
    """
@@ -317,23 +315,28 @@ def clear_webdav_directory():
    WEBDAV_BASE_URL = webdav_config['base_url']
    WEBDAV_USERNAME = webdav_config['username']
    WEBDAV_PASSWORD = webdav_config['password']

+    # Validate credentials
+    if not WEBDAV_USERNAME or not WEBDAV_PASSWORD:
+        print("Error: WebDAV username or password not configured")
+        return False
+
    auth = HTTPBasicAuth(WEBDAV_USERNAME, WEBDAV_PASSWORD)

    print("Clearing WebDAV directory...")

    # Get list of existing files
    file_urls = get_webdav_directory_contents(WEBDAV_BASE_URL, auth)

    if not file_urls:
        print("No files found in WebDAV directory (or could not list contents)")
        return True

    print(f"Found {len(file_urls)} files to delete")

    deleted_count = 0
    failed_count = 0

    for file_url in file_urls:
        # Convert relative URL to absolute if needed
        if file_url.startswith('/'):
@@ -343,31 +346,31 @@ def clear_webdav_directory():
full_url = f"{parsed_base.scheme}://{parsed_base.netloc}{file_url}" full_url = f"{parsed_base.scheme}://{parsed_base.netloc}{file_url}"
else: else:
full_url = file_url full_url = file_url
# Extract filename for display # Extract filename for display
filename = file_url.split('/')[-1] filename = file_url.split('/')[-1]
if delete_webdav_file(full_url, auth): if delete_webdav_file(full_url, auth):
print(f"✓ Deleted: {filename}") print(f"✓ Deleted: {filename}")
deleted_count += 1 deleted_count += 1
else: else:
print(f"✗ Failed to delete: {filename}") print(f"✗ Failed to delete: {filename}")
failed_count += 1 failed_count += 1
print(f"\nCleanup Summary:") print("\nCleanup Summary:")
print(f"✓ Files deleted: {deleted_count}") print(f"✓ Files deleted: {deleted_count}")
print(f"✗ Failed deletions: {failed_count}") print(f"✗ Failed deletions: {failed_count}")
return failed_count == 0 return failed_count == 0
def upload_files_to_webdav(file_paths): def upload_files_to_webdav(file_paths):
""" """
Upload multiple files to WebDAV server using preconfigured settings. Upload multiple files to WebDAV server using preconfigured settings.
Args: Args:
file_paths (list): List of local file paths to upload file_paths (list): List of local file paths to upload
Returns: Returns:
tuple: (successful_uploads, failed_uploads) tuple: (successful_uploads, failed_uploads)
""" """
@@ -376,33 +379,39 @@ def upload_files_to_webdav(file_paths):
    WEBDAV_BASE_URL = webdav_config['base_url']
    WEBDAV_USERNAME = webdav_config['username']
    WEBDAV_PASSWORD = webdav_config['password']

+    # Validate credentials
+    if not WEBDAV_USERNAME or not WEBDAV_PASSWORD:
+        print("Error: WebDAV username or password not configured")
+        return [], file_paths
+
    successful_uploads = []
    failed_uploads = []

    for file_path in file_paths:
+        filename = None  # Initialize filename variable
        try:
            # Get the filename from the local path
            filename = Path(file_path).name

            # Construct the full WebDAV URL including the filename
            webdav_file_url = f"{WEBDAV_BASE_URL}/{filename}"

            # Read the file content
            with open(file_path, 'rb') as f:
                file_content = f.read()

            # Prepare the request headers
            headers = {
                'Content-Type': 'text/markdown',
                'Content-Length': str(len(file_content))
            }

            # Set up authentication
            auth = HTTPBasicAuth(WEBDAV_USERNAME, WEBDAV_PASSWORD)

            print(f"Uploading: {filename}")

            # Make the PUT request to upload the file
            response = requests.put(
                webdav_file_url,
@@ -411,7 +420,7 @@ def upload_files_to_webdav(file_paths):
                auth=auth,
                timeout=30
            )

            if response.status_code in [200, 201, 204]:
                print(f"✓ Successfully uploaded: {filename}")
                successful_uploads.append(file_path)
@@ -419,14 +428,16 @@ def upload_files_to_webdav(file_paths):
print(f"✗ Upload failed for {filename} - Status: {response.status_code}") print(f"✗ Upload failed for {filename} - Status: {response.status_code}")
print(f" Response: {response.text}") print(f" Response: {response.text}")
failed_uploads.append(file_path) failed_uploads.append(file_path)
except requests.exceptions.RequestException as e: except requests.exceptions.RequestException as e:
print(f"✗ Network error uploading {filename}: {e}") display_filename = filename if filename else Path(file_path).name
print(f"✗ Network error uploading {display_filename}: {e}")
failed_uploads.append(file_path) failed_uploads.append(file_path)
except Exception as e: except Exception as e:
print(f"✗ Unexpected error uploading {filename}: {e}") display_filename = filename if filename else Path(file_path).name
print(f"✗ Unexpected error uploading {display_filename}: {e}")
failed_uploads.append(file_path) failed_uploads.append(file_path)
return successful_uploads, failed_uploads return successful_uploads, failed_uploads
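Both WebDAV helpers read credentials through get_webdav_config() from config.py, which is not part of this diff. A minimal sketch of what it plausibly returns, with the three key names taken from the usage above; the environment-variable lookup and the example URL are assumptions:

    # config.py (hypothetical sketch; only the key names appear in this diff)
    import os

    def get_webdav_config():
        return {
            'base_url': os.environ.get('WEBDAV_BASE_URL', 'https://dav.example.com/kindle'),
            'username': os.environ.get('WEBDAV_USERNAME', ''),
            'password': os.environ.get('WEBDAV_PASSWORD', ''),
        }

With empty-string defaults, the credential checks added in this commit fail fast instead of sending unauthenticated requests.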
@@ -456,28 +467,28 @@ def main():
        action='store_true',
        help='Skip clearing WebDAV directory before upload (files will be added/overwritten)'
    )

    args = parser.parse_args()

    # Check if input file exists
    input_path = Path(args.input_file)
    if not input_path.exists():
        print(f"Error: Input file '{args.input_file}' not found.")
        return 1

    try:
        # Parse the clippings file
        print(f"Parsing {args.input_file}...")
        books = parse_clippings_file(args.input_file)

        # Generate markdown files
        print("Generating markdown files...")
        generated_files = generate_markdown_files(books, args.output_dir)

        if not generated_files:
            print("No books with citations found.")
            return 0

        # Upload to WebDAV if requested
        if args.upload:
            # Clear WebDAV directory first (unless --no-clear is specified)
@@ -486,21 +497,21 @@ def main():
print("Warning: Some files could not be deleted from WebDAV directory.") print("Warning: Some files could not be deleted from WebDAV directory.")
print("Continuing with upload (files will be overwritten)...") print("Continuing with upload (files will be overwritten)...")
print() # Empty line for better readability print() # Empty line for better readability
print("Uploading to WebDAV server...") print("Uploading to WebDAV server...")
successful, failed = upload_files_to_webdav(generated_files) successful, failed = upload_files_to_webdav(generated_files)
print(f"\nUpload Summary:") print("\nUpload Summary:")
print(f"✓ Successful uploads: {len(successful)}") print(f"✓ Successful uploads: {len(successful)}")
print(f"✗ Failed uploads: {len(failed)}") print(f"✗ Failed uploads: {len(failed)}")
if failed: if failed:
print("Warning: Some files failed to upload.") print("Warning: Some files failed to upload.")
return 1 return 1
return 0 return 0
except Exception as e: except Exception as e:
print(f"Error processing file: {e}") print(f"Error processing file: {e}")
return 1 return 1
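With the changes applied, a typical run might look like this (the positional argument and the --output-dir/--upload flag names are inferred from the argparse references above; only --no-clear appears verbatim in this diff):

    python main.py "My Clippings.txt" --output-dir kindle_books --upload
    python main.py "My Clippings.txt" --upload --no-clear   # keep existing remote files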