Initial commit - Kindle Clippings Parser

🤖 Generated with [Claude Code](https://claude.ai/code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-05-28 20:02:32 +02:00
commit 4d3b167c28
7 changed files with 681 additions and 0 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -0,0 +1,7 @@
+kindle_books/*
+__pycache__/
+.ropeproject
+
+# Configuration files with sensitive data
+config.json
+My Clippings.txt
--- a/LICENSE
+++ b/LICENSE
@@ -0,0 +1,9 @@
+MIT License
+
+Copyright (c) 2025 JoelShepard
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/README.md
+++ b/README.md
@@ -0,0 +1,88 @@
+# Kindle Clippings Parser
+
+A Python tool to convert Kindle's "My Clippings.txt" file into organized Markdown files, creating separate documents for each book with optional WebDAV upload functionality.
+
+## Features
+
+- **Parse Kindle clippings**: Extract highlights and notes from "My Clippings.txt"
+- **Organize by book**: Create separate Markdown files for each book
+- **Page references**: Include page numbers or positions with each citation
+- **Safe filenames**: Automatically sanitize book titles for filesystem compatibility
+- **WebDAV upload**: Optional upload to cloud storage via WebDAV
+- **Batch operations**: Clear and update entire collections
+
+## Installation
+
+1. Clone this repository:
+   ```bash
+   git clone <repository-url>
+   cd KindleClippingsParser
+   ```
+
+2. Install Python dependencies:
+   ```bash
+   pip install requests
+   ```
+
+## Usage
+
+### Basic Usage
+
+Convert your Kindle clippings to Markdown files:
+
+```bash
+python main.py "My Clippings.txt"
+```
+
+### Advanced Options
+
+```bash
+# Specify custom output directory
+python main.py "My Clippings.txt" -d custom_output_folder
+
+# Upload to WebDAV server (files already in the remote directory are deleted first)
+python main.py "My Clippings.txt" --upload
+
+# Upload without clearing existing files
+python main.py "My Clippings.txt" --upload --no-clear
+```
+
+### WebDAV Configuration
+
+For WebDAV uploads, create a `config.json` file:
+
+```json
+{
+  "webdav": {
+    "base_url": "https://your-webdav-server.com/path",
+    "username": "your-username",
+    "password": "your-password"
+  }
+}
+```
+
+Alternatively, use environment variables:
+```bash
+export WEBDAV_BASE_URL="https://your-webdav-server.com/path"
+export WEBDAV_USERNAME="your-username"
+export WEBDAV_PASSWORD="your-password"
+```
+
+## Output Format
+
+Each book generates a Markdown file with citations formatted as blockquotes:
+
+```markdown
+> Your highlighted text here (p. 42)
+
+> Another highlight from the same book (pos. 1234)
+```
+
+## Requirements
+
+- Python 3.6+
+- `requests` library for WebDAV functionality
+
+## License
+
+This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
--- a/ToDo.md
+++ b/ToDo.md
@@ -0,0 +1,2 @@
+- Manipolazione del file MyClippings.txt diretta con Kindle collegato, con conseguente upload.
+- Interfaccia GUI
--- a/config.json.template
+++ b/config.json.template
@@ -0,0 +1,7 @@
+{
+  "webdav": {
+    "base_url": "https://your-webdav-server.com/path/to/kindle/folder",
+    "username": "your-username",
+    "password": "your-password"
+  }
+}
--- a/config.py
+++ b/config.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+"""
+Configuration module for Kindle Clippings Parser
+"""
+
+import os
+import json
+from pathlib import Path
+
+
+def load_config():
+    """
+    Load WebDAV settings from config.json, falling back to environment variables.
+
+    Each setting is read from the "webdav" object of config.json in the current
+    working directory; a missing or empty value falls back to WEBDAV_BASE_URL,
+    WEBDAV_USERNAME or WEBDAV_PASSWORD. An unreadable file or JSON of an
+    unexpected shape is ignored in favour of the environment.
+
+    Returns:
+        dict: {'webdav': {'base_url': ..., 'username': ..., 'password': ...}}
+
+    Raises:
+        ValueError: If any of the three settings is missing from both sources.
+    """
+    file_settings = {}
+    config_file = Path("config.json")
+    if config_file.exists():
+        try:
+            with open(config_file, 'r', encoding='utf-8') as f:
+                loaded = json.load(f)
+        except (json.JSONDecodeError, IOError) as e:
+            print(f"Warning: Could not load config.json: {e}")
+        else:
+            # Anything but an object holding a "webdav" object would break .get() below.
+            section = loaded.get('webdav') if isinstance(loaded, dict) else None
+            file_settings = section if isinstance(section, dict) else {}
+
+    # The file takes precedence; the environment only fills in missing values.
+    env_names = {'base_url': 'WEBDAV_BASE_URL', 'username': 'WEBDAV_USERNAME', 'password': 'WEBDAV_PASSWORD'}
+    webdav_config = {key: file_settings.get(key) or os.getenv(env) for key, env in env_names.items()}
+
+    missing_settings = [key for key, value in webdav_config.items() if not value]
+    if missing_settings:
+        raise ValueError(f"Missing required WebDAV configuration: {', '.join(missing_settings)}. "
+                         "Please provide them in config.json or as environment variables.")
+    return {'webdav': webdav_config}
+
+
+def get_webdav_config():
+    """
+    Return only the WebDAV section of the loaded configuration.
+
+    Returns:
+        dict: The 'webdav' entry of load_config() (base_url, username, password)
+    """
+    # load_config() raises if a setting is missing, so the key is always present.
+    return load_config()['webdav']
--- a/main.py
+++ b/main.py
@@ -0,0 +1,510 @@
+#!/usr/bin/env python3
+"""
+Kindle Clippings to Markdown Converter
+
+This script parses a "My Clippings.txt" file from Kindle and converts it 
+to structured markdown files, creating a separate file for each book.
+Optionally uploads the results to a WebDAV server.
+"""
+
+import re
+from collections import defaultdict
+from pathlib import Path
+import argparse
+import requests
+from requests.auth import HTTPBasicAuth
+import sys
+import os
+from xml.etree import ElementTree as ET
+from config import get_webdav_config
+
+
+def clean_title(title):
+    """Normalize a book title: drop BOMs (U+FEFF) and line breaks, trim whitespace."""
+    return title.replace('\ufeff', '').replace('\r', '').replace('\n', '').strip()
+
+
+def extract_page_info(metadata_line):
+    """
+    Extract a page (preferred) or position reference from a clipping's metadata line.
+
+    Recognizes the Italian Kindle wording ("pagina", "posizione") as well as the
+    English one ("page", "location", "loc."). For a range such as
+    "posizione 120-125" only the first number is reported.
+
+    Args:
+        metadata_line (str): The line containing page and position info
+
+    Returns:
+        str: "(p. N)" or "(pos. N)", or an empty string if neither is found
+    """
+    if not metadata_line:
+        return ""
+    text = metadata_line.lower()
+
+    # Order matters: a page number wins over a position when both are present.
+    patterns = ((r'(?:pagina|page)\s+(\d+)', 'p.'), (r'(?:posizione|location|loc\.)\s+(\d+)', 'pos.'))
+    for pattern, label in patterns:
+        found = re.search(pattern, text)
+        if found:
+            return f"({label} {found.group(1)})"
+    return ""
+
+
+def sanitize_filename(filename):
+    """
+    Turn a book title into a filename that is safe on common filesystems.
+
+    Characters that are invalid in filenames are swapped for look-alike
+    Unicode characters, whitespace runs collapse to a single space, leading
+    and trailing dots/spaces are removed and the result is capped at 200
+    characters (leaving room for the .md extension).
+
+    Args:
+        filename (str): Original filename
+
+    Returns:
+        str: Sanitized filename, or 'Untitled Book' if nothing usable remains
+    """
+    # Look-alike replacements for characters that filesystems reject.
+    replacements = {
+        '<': '⟨',   # Mathematical left angle bracket
+        '>': '⟩',   # Mathematical right angle bracket
+        ':': '꞉',   # Modifier letter colon
+        '"': '\u201d',  # Right double quotation mark (U+201D), escaped to keep it distinct
+        '/': '∕',   # Division slash
+        '\\': '∖',  # Set minus
+        '|': '❘',   # Light vertical bar
+        '?': '？',  # Full-width question mark
+        '*': '∗',   # Asterisk operator
+    }
+
+    # One pass over the string instead of one str.replace() per character.
+    filename = filename.translate(str.maketrans(replacements))
+
+    # Collapse whitespace runs, then drop the dots/spaces at both ends.
+    filename = re.sub(r'\s+', ' ', filename.strip()).strip('. ')
+
+    # Truncating can expose a new trailing dot or space, so strip once more.
+    if len(filename) > 200:
+        filename = filename[:200].strip('. ')
+
+    # Stripping may have consumed everything (e.g. a title made only of dots).
+    if not filename:
+        filename = 'Untitled Book'
+    return filename
+
+
+def parse_clippings_file(file_path):
+    """
+    Parse a My Clippings.txt file into citations grouped by book title.
+
+    Entries are separated by a run of ten '=' characters. Within an entry the
+    first non-empty line is the title, the second the metadata (page/position)
+    and everything after it is the highlighted or noted text. Entries without
+    any text, and bookmarks followed by fewer than 10 characters, are skipped.
+
+    Args:
+        file_path (str): Path to the My Clippings.txt file
+
+    Returns:
+        dict: Dictionary with book titles as keys and lists of (citation, page_ref) tuples as values
+
+    Raises:
+        OSError: If the file cannot be opened or read
+    """
+    try:
+        # utf-8-sig also consumes a byte order mark at the very start of the file.
+        with open(file_path, 'r', encoding='utf-8-sig') as file:
+            content = file.read()
+    except UnicodeDecodeError:
+        # Try with different encoding if UTF-8 fails
+        with open(file_path, 'r', encoding='latin-1') as file:
+            content = file.read()
+
+    # A BOM can also sit in front of later titles; left in place it would make
+    # otherwise identical titles compare unequal and split one book in two.
+    content = content.replace('\ufeff', '')
+
+    # Dictionary to group citations by book title
+    books = defaultdict(list)
+
+    for entry in content.split('=========='):
+        # Split entry into lines and filter empty ones
+        lines = [line.strip() for line in entry.split('\n') if line.strip()]
+
+        # A usable entry needs a title, a metadata line and at least one text line.
+        if len(lines) < 3:
+            continue
+
+        # First line is the book title, second line the metadata (page, position, etc.)
+        book_title = clean_title(lines[0])
+        metadata_line = lines[1]
+
+        # Join all content lines (in case citation spans multiple lines)
+        citation = ' '.join(lines[2:]).strip()
+        if not citation:
+            continue
+
+        # Bookmarks carry no quotable text; skip them even if a short string follows.
+        if 'segnalibro' in metadata_line.lower() and len(citation) < 10:
+            continue
+
+        books[book_title].append((citation, extract_page_info(metadata_line)))
+
+    return dict(books)
+
+
+def generate_markdown_files(books_dict, output_dir="kindle_books"):
+    """
+    Write one markdown file per book, each citation as a blockquote.
+
+    Files are named after the sanitized book title and overwrite any existing
+    file of the same name. A short summary is printed at the end.
+    Books are processed in alphabetical order of their title.
+
+    Args:
+        books_dict (dict): Dictionary with book titles and (citation, page_ref) tuples
+        output_dir (str): Directory to save markdown files; created, together
+            with any missing parent directories, if it does not exist
+
+    Returns:
+        list: List of generated file paths
+    """
+    output_path = Path(output_dir)
+    # parents=True lets a nested path such as "notes/kindle" be used as output_dir.
+    output_path.mkdir(parents=True, exist_ok=True)
+
+    generated_files = []
+
+    # Sort books alphabetically
+    for book_title, citations_with_pages in sorted(books_dict.items()):
+        if not citations_with_pages:  # Skip books with no citations
+            continue
+
+        # Create sanitized filename
+        file_path = output_path / f"{sanitize_filename(book_title)}.md"
+
+        # One blockquote per citation, followed by its page reference if known.
+        markdown_content = []
+        for citation, page_ref in citations_with_pages:
+            # Flatten stray line breaks so the quote stays on a single line.
+            citation = citation.replace('\r', '').replace('\n', ' ').strip()
+            if not citation:
+                continue
+            quote = f"> {citation} {page_ref}" if page_ref else f"> {citation}"
+            # Each item ends in a newline and items are joined with another one,
+            # so consecutive citations are separated by an empty line.
+            markdown_content.append(f"{quote}\n")
+
+        with open(file_path, 'w', encoding='utf-8') as f:
+            f.write('\n'.join(markdown_content))
+
+        generated_files.append(str(file_path))
+        print(f"Generated: {file_path}")
+
+    # Summary for the user
+    total_citations = sum(len(citations) for citations in books_dict.values())
+    print(f"\nTotal books processed: {len(generated_files)}")
+    print(f"Total citations: {total_citations}")
+    print(f"Files saved in: {output_path.absolute()}")
+
+    return generated_files
+
+
+def get_webdav_directory_contents(base_url, auth):
+    """
+    List the files stored directly in a WebDAV directory.
+
+    Sends a Depth: 1 PROPFIND to base_url and collects the href of every
+    entry that is not a collection. Any failure (unexpected status, network
+    error, malformed XML) is reported on stdout and yields an empty list.
+
+    Args:
+        base_url (str): WebDAV base URL
+        auth: HTTPBasicAuth object
+
+    Returns:
+        list: hrefs of the files found, exactly as returned by the server
+    """
+    dav_ns = {'D': 'DAV:'}
+    headers = {
+        'Depth': '1',
+        'Content-Type': 'application/xml'
+    }
+
+    # Basic PROPFIND body to get file names
+    propfind_body = '''<?xml version="1.0" encoding="utf-8" ?>
+        <D:propfind xmlns:D="DAV:">
+            <D:prop>
+                <D:displayname/>
+                <D:resourcetype/>
+            </D:prop>
+        </D:propfind>'''
+
+    try:
+        response = requests.request(
+            'PROPFIND',
+            base_url,
+            data=propfind_body,
+            headers=headers,
+            auth=auth,
+            timeout=30
+        )
+
+        # Anything but 207 Multi-Status is treated as a failed listing.
+        if response.status_code != 207:
+            print(f"Warning: Could not list directory contents - Status: {response.status_code}")
+            return []
+
+        file_urls = []
+        for entry in ET.fromstring(response.text).findall('.//D:response', dav_ns):
+            href_node = entry.find('D:href', dav_ns)
+            type_node = entry.find('.//D:resourcetype', dav_ns)
+            if href_node is None or type_node is None:
+                continue
+
+            # Directories are marked by a D:collection child of D:resourcetype.
+            if type_node.find('D:collection', dav_ns) is not None:
+                continue
+
+            href = href_node.text
+            # hrefs ending in '/' are skipped too, e.g. the listed directory itself.
+            if href and not href.endswith('/'):
+                file_urls.append(href)
+
+        return file_urls
+
+    except Exception as e:
+        print(f"Error listing directory contents: {e}")
+        return []
+
+
+def delete_webdav_file(file_url, auth):
+    """
+    Issue a DELETE request for one file on the WebDAV server.
+
+    Args:
+        file_url (str): Full URL of the file to delete
+        auth: HTTPBasicAuth object
+
+    Returns:
+        bool: True if the server answered 200, 204 or 404, False otherwise
+    """
+    # 404 means already deleted, so it counts as success.
+    accepted_statuses = (200, 204, 404)
+
+    try:
+        status = requests.delete(file_url, auth=auth, timeout=30).status_code
+        if status not in accepted_statuses:
+            print(f"Failed to delete {file_url} - Status: {status}")
+        return status in accepted_statuses
+    except Exception as e:
+        # Network problems and the like are reported, never raised to the caller.
+        print(f"Error deleting {file_url}: {e}")
+        return False
+
+
+def clear_webdav_directory():
+    """
+    Delete every file found in the configured WebDAV directory.
+
+    An empty listing counts as success. That includes a listing that failed,
+    because the listing helper reports both cases as an empty list.
+
+    Returns:
+        bool: True if no deletion failed, False otherwise
+    """
+    # Imported once here rather than on every loop iteration.
+    from urllib.parse import urljoin
+
+    webdav_config = get_webdav_config()
+    base_url = webdav_config['base_url']
+    auth = HTTPBasicAuth(webdav_config['username'], webdav_config['password'])
+
+    # With a trailing slash urljoin() resolves relative hrefs inside the directory.
+    join_base = base_url.rstrip('/') + '/'
+
+    print("Clearing WebDAV directory...")
+
+    # Get list of existing files
+    file_urls = get_webdav_directory_contents(base_url, auth)
+
+    if not file_urls:
+        print("No files found in WebDAV directory (or could not list contents)")
+        return True
+
+    print(f"Found {len(file_urls)} files to delete")
+
+    deleted_count = 0
+    failed_count = 0
+
+    for file_url in file_urls:
+        # Servers may return hrefs as absolute paths or as full URLs; urljoin()
+        # resolves the former against the server and keeps the latter as they are.
+        full_url = urljoin(join_base, file_url)
+
+        # Extract filename for display
+        filename = file_url.split('/')[-1]
+
+        if delete_webdav_file(full_url, auth):
+            print(f"✓ Deleted: {filename}")
+            deleted_count += 1
+        else:
+            print(f"✗ Failed to delete: {filename}")
+            failed_count += 1
+
+    print(f"\nCleanup Summary:")
+    print(f"✓ Files deleted: {deleted_count}")
+    print(f"✗ Failed deletions: {failed_count}")
+
+    return failed_count == 0
+
+
+def upload_files_to_webdav(file_paths):
+    """
+    Upload multiple files to WebDAV server using preconfigured settings.
+
+    Every file is sent with a PUT request to "<base_url>/<file name>". A
+    failure of one file is reported and does not stop the remaining uploads.
+
+    Args:
+        file_paths (list): List of local file paths to upload
+
+    Returns:
+        tuple: (successful_uploads, failed_uploads)
+    """
+    # Local import: the module header does not import urllib.parse.
+    from urllib.parse import quote
+
+    webdav_config = get_webdav_config()
+    # Avoid a double slash when the configured URL already ends with one.
+    base_url = webdav_config['base_url'].rstrip('/')
+    auth = HTTPBasicAuth(webdav_config['username'], webdav_config['password'])
+
+    successful_uploads = []
+    failed_uploads = []
+
+    for file_path in file_paths:
+        # Bound before the try block so the error messages below can always use it.
+        filename = str(file_path)
+        try:
+            filename = Path(file_path).name
+            # Percent-encode the name: a raw '#' or '%' in a book title would
+            # otherwise be read as a fragment or an escape sequence in the URL.
+            webdav_file_url = f"{base_url}/{quote(filename)}"
+
+            with open(file_path, 'rb') as f:
+                file_content = f.read()
+
+            headers = {
+                'Content-Type': 'text/markdown',
+                'Content-Length': str(len(file_content))
+            }
+
+            print(f"Uploading: {filename}")
+
+            response = requests.put(
+                webdav_file_url,
+                data=file_content,
+                headers=headers,
+                auth=auth,
+                timeout=30
+            )
+
+            if response.status_code in [200, 201, 204]:
+                print(f"✓ Successfully uploaded: {filename}")
+                successful_uploads.append(file_path)
+            else:
+                print(f"✗ Upload failed for {filename} - Status: {response.status_code}")
+                print(f"  Response: {response.text}")
+                failed_uploads.append(file_path)
+
+        except requests.exceptions.RequestException as e:
+            print(f"✗ Network error uploading {filename}: {e}")
+            failed_uploads.append(file_path)
+        except Exception as e:
+            print(f"✗ Unexpected error uploading {filename}: {e}")
+            failed_uploads.append(file_path)
+
+    return successful_uploads, failed_uploads
+
+
+def main():
+    """
+    Run the converter from the command line.
+
+    Parses the arguments, converts the clippings file to markdown files and,
+    when --upload is given, pushes them to the WebDAV server.
+
+    Returns:
+        int: Process exit status, 0 on success and 1 on any failure
+    """
+    parser = argparse.ArgumentParser(description="Convert Kindle My Clippings.txt to separate Markdown files for each book")
+    parser.add_argument(
+        'input_file',
+        nargs='?',
+        default='My Clippings.txt',
+        help='Path to the My Clippings.txt file (default: My Clippings.txt)'
+    )
+    parser.add_argument(
+        '-d', '--output-dir',
+        default='kindle_books',
+        help='Output directory for markdown files (default: kindle_books)'
+    )
+    parser.add_argument(
+        '--upload',
+        action='store_true',
+        help='Upload the markdown files to WebDAV server'
+    )
+    parser.add_argument(
+        '--no-clear',
+        action='store_true',
+        help='Skip clearing WebDAV directory before upload (files will be added/overwritten)'
+    )
+    options = parser.parse_args()
+
+    # Check that the input file exists before doing any work.
+    if not Path(options.input_file).exists():
+        print(f"Error: Input file '{options.input_file}' not found.")
+        return 1
+
+    try:
+        print(f"Parsing {options.input_file}...")
+        books = parse_clippings_file(options.input_file)
+
+        print("Generating markdown files...")
+        generated_files = generate_markdown_files(books, options.output_dir)
+
+        if not generated_files:
+            print("No books with citations found.")
+            return 0
+
+        # Without --upload the local files are the whole job.
+        if not options.upload:
+            return 0
+
+        # The remote directory is emptied first unless --no-clear was given.
+        if not options.no_clear:
+            if not clear_webdav_directory():
+                print("Warning: Some files could not be deleted from WebDAV directory.")
+                print("Continuing with upload (files will be overwritten)...")
+            print()  # Empty line for better readability
+
+        print("Uploading to WebDAV server...")
+        successful, failed = upload_files_to_webdav(generated_files)
+
+        print(f"\nUpload Summary:")
+        print(f"✓ Successful uploads: {len(successful)}")
+        print(f"✗ Failed uploads: {len(failed)}")
+
+        if failed:
+            print("Warning: Some files failed to upload.")
+            return 1
+        return 0
+    except Exception as e:
+        print(f"Error processing file: {e}")
+        return 1
+
+
+if __name__ == "__main__":
+    sys.exit(main())  # sys.exit, not the site-provided exit() meant for interactive use