Initial commit - Kindle Clippings Parser

🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-05-28 20:02:32 +02:00
commit 4d3b167c28
7 changed files with 681 additions and 0 deletions
--- a/main.py
+++ b/main.py
@@ -0,0 +1,510 @@
+#!/usr/bin/env python3
+"""
+Kindle Clippings to Markdown Converter
+
+This script parses a "My Clippings.txt" file from Kindle and converts it 
+to structured markdown files, creating a separate file for each book.
+Optionally uploads the results to a WebDAV server.
+"""
+
+import re
+from collections import defaultdict
+from pathlib import Path
+import argparse
+import requests
+from requests.auth import HTTPBasicAuth
+import sys
+import os
+from xml.etree import ElementTree as ET
+from config import get_webdav_config
+
+
+def clean_title(title):
+    """
+    Clean and normalize a book title taken from a clippings entry.
+
+    A byte-order mark (U+FEFF) is not whitespace for ``str.strip()``, so it is
+    removed explicitly. Left in place, the invisible character would make the
+    same book appear under two different titles.
+
+    Args:
+        title (str): Raw title line
+
+    Returns:
+        str: Title without byte-order marks, line breaks, or surrounding whitespace
+    """
+    for unwanted in ('\ufeff', '\r', '\n'):
+        title = title.replace(unwanted, '')
+    return title.strip()
+
+
+def extract_page_info(metadata_line):
+    """
+    Extract page information from the metadata line.
+
+    The Italian keywords ("pagina", "posizione") and their English
+    counterparts ("page", "location") are recognized, case-insensitively.
+    A page number is preferred; the position is only used as a fallback.
+    For ranges such as "180-182" only the first number is reported.
+
+    Args:
+        metadata_line (str): The line containing page and position info
+
+    Returns:
+        str: "(p. N)" or "(pos. N)", or an empty string if neither is found
+    """
+    metadata_line = metadata_line.lower()
+
+    # Ordered by priority: the first pattern that matches decides the result
+    lookups = (
+        (r'\b(?:pagina|page)\s+(\d+)', 'p.'),
+        (r'\b(?:posizione|location)\s+(\d+)', 'pos.'),
+    )
+
+    for pattern, label in lookups:
+        match = re.search(pattern, metadata_line)
+        if match:
+            return f"({label} {match.group(1)})"
+
+    return ""
+
+
+def sanitize_filename(filename):
+    """
+    Sanitize filename by removing/replacing invalid characters while preserving spaces.
+
+    Characters that common filesystems reject are swapped for look-alike
+    Unicode characters, whitespace runs are collapsed to a single space,
+    surrounding dots and spaces are dropped, and the result is capped at
+    200 characters.
+
+    Args:
+        filename (str): Original filename
+
+    Returns:
+        str: Sanitized filename safe for filesystem with preserved spaces;
+            'Untitled Book' when nothing usable remains
+    """
+    # Replace invalid filesystem characters with safe alternatives
+    replacements = {
+        '<': '⟨',        # Mathematical left angle bracket
+        '>': '⟩',        # Mathematical right angle bracket
+        ':': '꞉',        # Modifier letter colon
+        '"': '\u201d',   # Right double quotation mark, escaped so it cannot be mistaken for the ASCII quote
+        '/': '∕',        # Division slash
+        '\\': '∖',       # Set minus
+        '|': '❘',        # Light vertical bar
+        '?': '？',       # Full-width question mark
+        '*': '∗',        # Asterisk operator
+    }
+
+    # One pass over the string instead of one replace() call per character
+    filename = filename.translate(str.maketrans(replacements))
+
+    # Clean up multiple spaces but preserve single spaces
+    filename = re.sub(r'\s+', ' ', filename.strip())
+
+    # Remove leading/trailing dots and spaces that could cause issues
+    filename = filename.strip('. ')
+
+    # Limit length to avoid filesystem issues (keeping some margin for .md extension).
+    # The cut can expose a new trailing dot or space, so trim the end again.
+    if len(filename) > 200:
+        filename = filename[:200].rstrip('. ')
+
+    # Ensure filename is not empty
+    return filename or 'Untitled Book'
+
+
+def parse_clippings_file(file_path):
+    """
+    Parse the My Clippings.txt file and extract book titles, citations, and page references.
+
+    Entries are separated by a '==========' delimiter. Within an entry the
+    first non-empty line is the title, the second is the metadata line, and
+    all remaining lines form the citation text (joined with single spaces).
+    Entries without citation text are ignored, and so are bookmark entries
+    ("segnalibro" in the metadata) whose text is shorter than 10 characters.
+
+    Args:
+        file_path (str): Path to the My Clippings.txt file
+
+    Returns:
+        dict: Dictionary with book titles as keys and lists of (citation, page_ref) tuples as values
+    """
+    try:
+        # utf-8-sig decodes like UTF-8 but drops a byte-order mark at the start of the file
+        with open(file_path, 'r', encoding='utf-8-sig') as file:
+            content = file.read()
+    except UnicodeDecodeError:
+        # Try with different encoding if UTF-8 fails
+        with open(file_path, 'r', encoding='latin-1') as file:
+            content = file.read()
+
+    # Byte-order marks further inside the text would otherwise end up in a
+    # title and split one book into two groups
+    content = content.replace('\ufeff', '')
+
+    # Dictionary to group citations by book title
+    books = defaultdict(list)
+
+    for entry in content.split('=========='):
+        # Split entry into lines and filter empty ones
+        lines = [line.strip() for line in entry.split('\n') if line.strip()]
+
+        # Title, metadata and at least one line of text are required
+        if len(lines) < 3:
+            continue
+
+        title_line, metadata_line, *text_lines = lines
+
+        # Join all content lines (in case citation spans multiple lines)
+        citation = ' '.join(text_lines)
+
+        # Skip bookmark entries with very short text
+        if 'segnalibro' in metadata_line.lower() and len(citation) < 10:
+            continue
+
+        books[clean_title(title_line)].append((citation, extract_page_info(metadata_line)))
+
+    return dict(books)
+
+
+def generate_markdown_files(books_dict, output_dir="kindle_books"):
+    """
+    Generate separate markdown files for each book with page references.
+
+    Books are processed in alphabetical order of their title. Every citation
+    becomes one blockquote line, followed by its page reference when there is
+    one. Progress and a short summary are printed to stdout.
+
+    Args:
+        books_dict (dict): Dictionary with book titles and (citation, page_ref) tuples
+        output_dir (str): Directory to save markdown files
+
+    Returns:
+        list: List of generated file paths
+    """
+    # Create output directory if it doesn't exist; parents=True also covers
+    # nested targets such as "exports/kindle"
+    output_path = Path(output_dir)
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    generated_files = []
+
+    # Sort books alphabetically
+    for book_title, citations_with_pages in sorted(books_dict.items()):
+        if not citations_with_pages:  # Skip books with no citations
+            continue
+
+        # Create sanitized filename
+        file_path = output_path / f"{sanitize_filename(book_title)}.md"
+
+        # Add each citation as a blockquote with page reference
+        markdown_content = []
+        for citation, page_ref in citations_with_pages:
+            # Clean up citation text
+            citation = citation.replace('\r', '').replace('\n', ' ').strip()
+            if not citation:
+                continue
+            quoted_text = f"{citation} {page_ref}" if page_ref else citation
+            markdown_content.append(f"> {quoted_text}\n")
+
+        # Each item already ends with a newline, so joining with '\n' leaves
+        # one blank line between consecutive blockquotes
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(markdown_content))
+
+        generated_files.append(str(file_path))
+        print(f"Generated: {file_path}")
+
+    total_citations = sum(len(citations) for citations in books_dict.values())
+    print(f"\nTotal books processed: {len(generated_files)}")
+    print(f"Total citations: {total_citations}")
+    print(f"Files saved in: {output_path.absolute()}")
+
+    return generated_files
+
+
+def get_webdav_directory_contents(base_url, auth):
+    """
+    Get list of files in the WebDAV directory using PROPFIND request.
+
+    Only direct children are requested (Depth: 1). Collections (directories)
+    and hrefs ending in '/' are left out of the result. Any request or XML
+    parsing problem is reported on stdout and yields an empty list.
+
+    Args:
+        base_url (str): WebDAV base URL
+        auth: HTTPBasicAuth object
+
+    Returns:
+        list: List of file hrefs exactly as reported by the server
+    """
+    headers = {
+        'Depth': '1',
+        'Content-Type': 'application/xml'
+    }
+
+    # Basic PROPFIND body to get file names
+    propfind_body = '''<?xml version="1.0" encoding="utf-8" ?>
+        <D:propfind xmlns:D="DAV:">
+            <D:prop>
+                <D:displayname/>
+                <D:resourcetype/>
+            </D:prop>
+        </D:propfind>'''
+
+    namespaces = {'D': 'DAV:'}
+
+    try:
+        response = requests.request(
+            'PROPFIND',
+            base_url,
+            data=propfind_body,
+            headers=headers,
+            auth=auth,
+            timeout=30
+        )
+
+        if response.status_code != 207:  # WebDAV Multi-Status
+            print(f"Warning: Could not list directory contents - Status: {response.status_code}")
+            return []
+
+        # Parse the raw bytes so the XML parser applies the encoding declared
+        # in the document itself instead of the charset guessed from the
+        # HTTP headers
+        root = ET.fromstring(response.content)
+    except (requests.exceptions.RequestException, ET.ParseError) as e:
+        print(f"Error listing directory contents: {e}")
+        return []
+
+    file_urls = []
+
+    for response_elem in root.findall('.//D:response', namespaces):
+        href_elem = response_elem.find('D:href', namespaces)
+        resourcetype_elem = response_elem.find('.//D:resourcetype', namespaces)
+        if href_elem is None or resourcetype_elem is None:
+            continue
+
+        href = href_elem.text
+        if not href:
+            continue
+
+        # Skip directories (they contain D:collection element)
+        if resourcetype_elem.find('D:collection', namespaces) is not None:
+            continue
+
+        # Skip the base directory itself
+        if not href.endswith('/'):
+            file_urls.append(href)
+
+    return file_urls
+
+
+def delete_webdav_file(file_url, auth):
+    """
+    Delete a single file from WebDAV server.
+
+    A 404 answer counts as success because the file is already gone.
+    Failures are reported on stdout instead of being raised.
+
+    Args:
+        file_url (str): Full URL of the file to delete
+        auth: HTTPBasicAuth object
+
+    Returns:
+        bool: True if successful, False otherwise
+    """
+    accepted_statuses = (200, 204, 404)
+
+    try:
+        status = requests.delete(file_url, auth=auth, timeout=30).status_code
+    except Exception as error:
+        print(f"Error deleting {file_url}: {error}")
+        return False
+
+    if status in accepted_statuses:
+        return True
+
+    print(f"Failed to delete {file_url} - Status: {status}")
+    return False
+
+
+def clear_webdav_directory():
+    """
+    Clear all files from the WebDAV directory.
+
+    Connection settings come from get_webdav_config(). Every file returned
+    by the directory listing is deleted, and progress plus a summary are
+    printed to stdout. An empty listing is treated as success, including
+    the case where the listing itself could not be retrieved.
+
+    Returns:
+        bool: True if successful, False otherwise
+    """
+    from urllib.parse import unquote, urljoin
+
+    # Get WebDAV configuration
+    webdav_config = get_webdav_config()
+    base_url = webdav_config['base_url']
+    auth = HTTPBasicAuth(webdav_config['username'], webdav_config['password'])
+
+    print("Clearing WebDAV directory...")
+
+    # Get list of existing files
+    file_urls = get_webdav_directory_contents(base_url, auth)
+
+    if not file_urls:
+        print("No files found in WebDAV directory (or could not list contents)")
+        return True
+
+    print(f"Found {len(file_urls)} files to delete")
+
+    # With a trailing slash urljoin treats the base as a directory, so a bare
+    # "name.md" href resolves inside it; hrefs starting with '/' still
+    # resolve against the host, and absolute URLs pass through unchanged
+    join_base = base_url.rstrip('/') + '/'
+
+    deleted_count = 0
+    failed_count = 0
+
+    for file_url in file_urls:
+        full_url = urljoin(join_base, file_url)
+
+        # Extract filename for display, decoding percent-escapes for readability
+        filename = unquote(file_url.rsplit('/', 1)[-1])
+
+        if delete_webdav_file(full_url, auth):
+            print(f"✓ Deleted: {filename}")
+            deleted_count += 1
+        else:
+            print(f"✗ Failed to delete: {filename}")
+            failed_count += 1
+
+    print("\nCleanup Summary:")
+    print(f"✓ Files deleted: {deleted_count}")
+    print(f"✗ Failed deletions: {failed_count}")
+
+    return failed_count == 0
+
+
+def upload_files_to_webdav(file_paths):
+    """
+    Upload multiple files to WebDAV server using preconfigured settings.
+
+    Each file is sent with a PUT request to "<base_url>/<file name>". The
+    file name is percent-encoded, so characters such as '#' or '%' in a book
+    title cannot cut off or alter the target URL. A failure of one file is
+    reported on stdout and does not stop the remaining uploads.
+
+    Args:
+        file_paths (list): List of local file paths to upload
+
+    Returns:
+        tuple: (successful_uploads, failed_uploads)
+    """
+    from urllib.parse import quote
+
+    # Get WebDAV configuration
+    webdav_config = get_webdav_config()
+    # Drop a trailing slash so the joined URL never contains '//'
+    base_url = webdav_config['base_url'].rstrip('/')
+    # The credentials are the same for every file, so build the auth object once
+    auth = HTTPBasicAuth(webdav_config['username'], webdav_config['password'])
+
+    successful_uploads = []
+    failed_uploads = []
+
+    for file_path in file_paths:
+        # Get the filename from the local path
+        filename = Path(file_path).name
+
+        # Construct the full WebDAV URL including the escaped filename
+        webdav_file_url = f"{base_url}/{quote(filename, safe='')}"
+
+        try:
+            # Read the file content
+            with open(file_path, 'rb') as f:
+                file_content = f.read()
+
+            # Prepare the request headers
+            headers = {
+                'Content-Type': 'text/markdown',
+                'Content-Length': str(len(file_content))
+            }
+
+            print(f"Uploading: {filename}")
+
+            # Make the PUT request to upload the file
+            response = requests.put(
+                webdav_file_url,
+                data=file_content,
+                headers=headers,
+                auth=auth,
+                timeout=30
+            )
+        except requests.exceptions.RequestException as e:
+            print(f"✗ Network error uploading {filename}: {e}")
+            failed_uploads.append(file_path)
+            continue
+        except Exception as e:
+            # Deliberately broad: one unreadable file must not abort the batch
+            print(f"✗ Unexpected error uploading {filename}: {e}")
+            failed_uploads.append(file_path)
+            continue
+
+        if response.status_code in (200, 201, 204):
+            print(f"✓ Successfully uploaded: {filename}")
+            successful_uploads.append(file_path)
+        else:
+            print(f"✗ Upload failed for {filename} - Status: {response.status_code}")
+            print(f"  Response: {response.text}")
+            failed_uploads.append(file_path)
+
+    return successful_uploads, failed_uploads
+
+
+def main():
+    """
+    Command-line entry point of the converter.
+
+    Parses the arguments, converts the clippings file into one markdown file
+    per book and, when --upload is given, sends the files to the WebDAV
+    server (clearing the remote directory first unless --no-clear is set).
+
+    Returns:
+        int: 0 on success, 1 if the input file is missing, processing
+            raised an error, or at least one upload failed
+    """
+    cli = argparse.ArgumentParser(
+        description="Convert Kindle My Clippings.txt to separate Markdown files for each book"
+    )
+    cli.add_argument(
+        'input_file',
+        nargs='?',
+        default='My Clippings.txt',
+        help='Path to the My Clippings.txt file (default: My Clippings.txt)'
+    )
+    cli.add_argument(
+        '-d', '--output-dir',
+        default='kindle_books',
+        help='Output directory for markdown files (default: kindle_books)'
+    )
+    cli.add_argument(
+        '--upload',
+        action='store_true',
+        help='Upload the markdown files to WebDAV server'
+    )
+    cli.add_argument(
+        '--no-clear',
+        action='store_true',
+        help='Skip clearing WebDAV directory before upload (files will be added/overwritten)'
+    )
+    options = cli.parse_args()
+
+    # Nothing to do without the clippings file
+    if not Path(options.input_file).exists():
+        print(f"Error: Input file '{options.input_file}' not found.")
+        return 1
+
+    try:
+        print(f"Parsing {options.input_file}...")
+        books = parse_clippings_file(options.input_file)
+
+        print("Generating markdown files...")
+        generated_files = generate_markdown_files(books, options.output_dir)
+        if not generated_files:
+            print("No books with citations found.")
+            return 0
+
+        # Local conversion only: finished here
+        if not options.upload:
+            return 0
+
+        # The remote directory is emptied first unless the user opted out
+        if not options.no_clear:
+            cleared = clear_webdav_directory()
+            if not cleared:
+                print("Warning: Some files could not be deleted from WebDAV directory.")
+                print("Continuing with upload (files will be overwritten)...")
+            print()  # Blank line separating cleanup output from upload output
+
+        print("Uploading to WebDAV server...")
+        successful, failed = upload_files_to_webdav(generated_files)
+
+        print("\nUpload Summary:")
+        print(f"✓ Successful uploads: {len(successful)}")
+        print(f"✗ Failed uploads: {len(failed)}")
+
+        if failed:
+            print("Warning: Some files failed to upload.")
+            return 1
+        return 0
+
+    except Exception as error:
+        print(f"Error processing file: {error}")
+        return 1
+
+
+if __name__ == "__main__":
+    # sys.exit is always available; the bare exit() helper only exists when
+    # the site module has been loaded
+    sys.exit(main())