#!/usr/bin/env python
# vim:fileencoding=utf-8
'''
WSJ recipe that uses your real subscription via cookies exported from a
logged-in browser.

Setup (one time):
  1. Install a cookies.txt exporter extension in your browser:
       - Chrome: "Get cookies.txt LOCALLY"
       - Firefox: "cookies.txt"
  2. Log into https://www.wsj.com in that browser.
  3. Open the extension and export cookies for both wsj.com AND
     dowjones.com (or just "Export All").
  4. Save the resulting file as ~/.wsj_cookies.txt
       (or set the WSJ_COOKIES env var to a different path)

Cookies typically last several weeks. Re-export when articles start
showing the paywall again.
'''
import json
import os
import time
from http.cookiejar import MozillaCookieJar
from urllib.parse import quote, urlencode

from calibre.ebooks.BeautifulSoup import BeautifulSoup
from calibre.web.feeds.news import BasicNewsRecipe


def has_class(name):
    """Build a predicate that tests whether ``name`` is one of an element's classes.

    The returned callable accepts a ``class`` attribute value, either as a
    list/tuple of class names or as a whitespace-separated string, and
    returns True only on an exact match with one of the class names.
    Missing or empty values never match.
    """
    def _matches(value):
        if value:
            if isinstance(value, (list, tuple)):
                tokens = value
            else:
                tokens = str(value).split()
            return name in tokens
        return False
    return _matches


def class_contains(substr):
    """Build a predicate that tests whether any class name contains ``substr``.

    The returned callable accepts a ``class`` attribute value, either as a
    list/tuple of class names or as a whitespace-separated string, and
    returns True as soon as one class name has ``substr`` as a substring.
    Missing or empty values never match.
    """
    def _matches(value):
        if value:
            if isinstance(value, (list, tuple)):
                tokens = value
            else:
                tokens = str(value).split()
            for token in tokens:
                if substr in token:
                    return True
        return False
    return _matches


class WSJSubscriber(BasicNewsRecipe):
    """Recipe for the WSJ print edition, authenticated with exported cookies.

    ``parse_index`` reads the edition and section listings from the Dow Jones
    GraphQL gateway, ``get_browser`` attaches the cookie jar exported from a
    logged-in browser, ``preprocess_html`` cleans up each article page, and
    ``_download_cover`` renders a simple typographic cover locally.
    """
    title = 'The Wall Street Journal'
    __author__ = 'The Wall Street Journal'
    publisher = 'The Wall Street Journal'
    publication_type = 'newspaper'
    category = 'News'
    tags = 'News'
    description = (
        'Print Edition of WSJ, fetched directly from wsj.com using cookies '
        'exported from a logged-in browser session.'
    )
    language = 'en_US'
    masthead_url = 'https://s.wsj.net/media/wsj_amp_masthead_lg.png'
    encoding = 'utf-8'
    no_javascript = True
    no_stylesheets = True
    remove_attributes = ['style', 'height', 'width']
    resolve_internal_links = True

    # Netscape-format cookies file; WSJ_COOKIES overrides the default path.
    cookies_path = os.path.expanduser(
        os.environ.get('WSJ_COOKIES', '~/.wsj_cookies.txt')
    )

    recipe_specific_options = {
        'date': {
            'short': 'The date of the edition to download (YYYY-MM-DD)\nOnly the past 6 editions are available',
            'long':  'For example, 2026-05-03',
        },
        'res': {
            'short': 'Image width: 300, 400, 600, 800, 1000, 1200, 1500',
            'long':  'For e-ink keep this small. 400 suits a ~4" panel.',
            'default': '400',
        },
    }

    extra_css = '''
        img { max-width: 100%; height: auto; }
        figure, figcaption { page-break-inside: avoid; }
        #big-top-caption { font-size:small; text-align:center; }
        [data-type="tagline"] { font-style:italic; color:#202020; }
        .auth, time { font-size:small; }
        .sub, em, i { color: #202020; }
    '''

    keep_only_tags = [
        dict(name='div', attrs={'class': has_class('article-container')}),
        dict(attrs={'aria-describedby': 'big-top-caption'}),
        dict(attrs={'id': 'big-top-caption'}),
    ]

    remove_tags = [
        dict(attrs={'data-type': ['inset', 'video']}),
        dict(attrs={'data-testid': 'ad-container'}),
        dict(attrs={'data-spotim-app': 'conversation'}),
        dict(name=['button', 'svg', 'old-script', 'video']),
        dict(attrs={'aria-label': [
            'Sponsored Offers', 'Listen To Article', 'What to Read Next',
            'Utility Bar', 'Conversation', 'List of Comments', 'Comment',
            'JR More Articles',
        ]}),
        dict(attrs={'data-spot-im-class': [
            'message-text', 'messages-list', 'message-view', 'conversation-root',
        ]}),
        dict(attrs={'id': lambda x: x and x.startswith((
            'comments_sector', 'wrapper-INLINE', 'audio-tag-inner-audio-',
            'article-comments-tool',
        ))}),
        dict(name='div', attrs={'data-message-depth': True}),
        dict(name='div', attrs={'class': has_class('print-header')}),
        dict(name='div', attrs={'class': has_class('aside-container')}),
        dict(attrs={'data-testid': ['panel-transition-container', 'accordion-content']}),
    ]

    # No archive.is bypass — we have a real subscription.
    articles_are_obfuscated = False

    def get_browser(self, *args, **kw):
        """Return the base recipe browser with WSJ headers and cookies attached.

        A missing or unreadable cookies file is reported as a warning rather
        than an error; the browser is then returned without a cookie jar.
        """
        br = BasicNewsRecipe.get_browser(self, *args, **kw)
        br.set_handle_robots(False)
        br.addheaders = [
            ('User-Agent',
             'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) '
             'AppleWebKit/605.1.15 (KHTML, like Gecko) '
             'Version/17.5 Safari/605.1.15'),
            ('Accept',
             'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'),
            ('Accept-Language', 'en-US,en;q=0.9'),
            ('apollographql-client-name', 'wsj-mobile-android-release'),
        ]
        if not os.path.exists(self.cookies_path):
            self.log.warn(
                'WSJ cookies file not found at', self.cookies_path,
                '— articles will hit the paywall. See the comment at the top of this recipe.'
            )
            return br
        cj = MozillaCookieJar(self.cookies_path)
        try:
            cj.load(ignore_discard=True, ignore_expires=True)
        except OSError as err:
            # http.cookiejar.LoadError (malformed file) is an OSError subclass;
            # treat it like a missing file instead of aborting the download.
            self.log.warn(
                'Could not load WSJ cookies from', self.cookies_path,
                '(' + str(err) + ') — articles will hit the paywall.'
            )
            return br
        br.set_cookiejar(cj)
        self.log('Loaded WSJ cookies from', self.cookies_path)
        return br

    def preprocess_html(self, soup):
        """Strip non-article boxes and normalise images, links and headings.

        Mutates ``soup`` in place and returns it.
        """
        # WSJ embeds related-content boxes (Videos / Most Popular / Further
        # Reading) inside the article body. They're identified by a heading
        # whose class contains '-SectionLabel'; the surrounding StackWrapper
        # is the box we want to drop.
        for label in soup.findAll(
            ['h2', 'h3'], attrs={'class': class_contains('SectionLabel')}
        ):
            target, cur = label, label
            # Look at most 8 ancestors up; if no StackWrapper is found only
            # the heading itself is removed.
            for _ in range(8):
                cur = cur.parent if cur is not None else None
                if cur is None:
                    break
                cls = cur.get('class') or []
                if any('StackWrapper' in c for c in cls):
                    target = cur
                    break
            target.extract()
        # Trailing copyright/legal blurb (e.g. "Copyright (c) 2026 Dow Jones...")
        for p in soup.findAll('p', attrs={'class': class_contains('e1jdulti')}):
            p.extract()

        # Requested image width, as a query string appended to image URLs.
        res = '?width=400'
        w = self.recipe_specific_options.get('res')
        if w and isinstance(w, str):
            res = '?width=' + w
        for img in soup.findAll('img', attrs={'currentsourceurl': True}):
            img['src'] = img['currentsourceurl'].split('?')[0] + res
        # Most live wsj.com images use a normal src + srcset and have no
        # currentsourceurl. Strip srcset so the converter uses our chosen
        # src, and rewrite the src to the requested width.
        for img in soup.findAll('img', src=True):
            if 'srcset' in img.attrs:
                del img['srcset']
            src = img['src']
            if 'wsj.net' in src:
                img['src'] = src.split('?')[0] + res

        for p in soup.findAll('div', attrs={'data-type': ['paragraph', 'image']}):
            p.name = 'p'
        for a in soup.findAll('a', href=True):
            href = a['href']
            # Keep only the last absolute URL embedded in the href. Links
            # with no 'http' at all (relative paths, '#anchor', 'mailto:')
            # are left untouched so they are not turned into 'http#anchor'.
            if 'http' in href:
                a['href'] = 'http' + href.split('http')[-1]
        for figc in soup.findAll('figcaption'):
            figc['id'] = 'big-top-caption'
        if name := soup.find('h2', attrs={'itemprop': 'name'}):
            name.extract()
        for h2 in soup.findAll('h2'):
            if self.tag_to_string(h2).startswith(('What to Read Next', 'Conversation')):
                h2.extract()
                # Already removed from the tree; nothing left to restyle.
                continue
            h2.name = 'h3'
            h2['class'] = 'sub'
        for ph in soup.findAll('a', attrs={'data-type': ['phrase', 'link']}):
            if div := ph.findParent('div'):
                div.name = 'span'
        for auth in soup.findAll(
            'a', attrs={'aria-label': lambda x: x and x.startswith('Author page')}
        ):
            if div := auth.find_previous_sibling('div'):
                div.name = 'span'
            if parent := auth.findParent('div'):
                parent['class'] = 'auth'
        for x in soup.findAll('ufc-follow-author-widget'):
            if y := x.findParent('div'):
                y.extract()
        return soup

    def _download_cover(self):
        """Render a 'WSJ' + date cover image and store its path in ``cover_path``.

        The date shown is, in order of preference: the edition label recorded
        by ``parse_index``, the ``date`` recipe option (YYYY-MM-DD), or today.
        """
        from datetime import datetime
        from io import BytesIO

        from PIL import Image, ImageDraw, ImageFont
        from calibre.utils.img import save_cover_data_to

        # macOS system fonts. Bodoni 72 Bold is the closest free face to
        # the WSJ masthead. Falls back through other serifs if missing.
        title_fonts = [
            ('/System/Library/Fonts/Supplemental/Bodoni 72.ttc', 2),
            ('/System/Library/Fonts/Supplemental/Didot.ttc', 2),
            ('/System/Library/Fonts/Supplemental/Hoefler Text.ttc', 1),
            ('/System/Library/Fonts/Supplemental/Times New Roman Bold.ttf', 0),
        ]
        date_fonts = [
            ('/System/Library/Fonts/Supplemental/Didot.ttc', 0),
            ('/System/Library/Fonts/Supplemental/Bodoni 72.ttc', 0),
            ('/System/Library/Fonts/Supplemental/Hoefler Text.ttc', 0),
            ('/System/Library/Fonts/Supplemental/Times New Roman.ttf', 0),
        ]

        def load_font(candidates, size):
            # First candidate that exists and loads wins; otherwise fall back
            # to Pillow's built-in default font.
            for path, idx in candidates:
                if os.path.exists(path):
                    try:
                        return ImageFont.truetype(path, size, index=idx)
                    except Exception:
                        continue
            return ImageFont.load_default()

        def long_date(dt):
            # e.g. "Sunday, May 3, 2026" (day without zero padding).
            return f'{dt.strftime("%A, %B")} {dt.day}, {dt.year}'

        # Prefer the issue label set during parse_index, then explicit option, then today.
        date_str = getattr(self, 'cover_date_label', '') or ''
        if not date_str:
            opt = self.recipe_specific_options.get('date')
            if opt and isinstance(opt, str):
                try:
                    # The option is documented as YYYY-MM-DD.
                    date_str = long_date(datetime.strptime(opt, '%Y-%m-%d'))
                except ValueError:
                    self.log.warn('Ignoring unparseable date option:', opt)
        if not date_str:
            date_str = long_date(datetime.now())

        W, H = 1200, 1600
        img = Image.new('RGB', (W, H), 'white')
        draw = ImageDraw.Draw(img)
        title_font = load_font(title_fonts, 560)
        date_font = load_font(date_fonts, 64)

        # Title: horizontally centred, slightly above the vertical centre.
        title = 'WSJ'
        t_ascent, t_descent = title_font.getmetrics()
        title_w = draw.textlength(title, font=title_font)
        title_x = int((W - title_w) // 2)
        title_y = (H - (t_ascent + t_descent)) // 2 - 80
        draw.text((title_x, title_y), title, font=title_font, fill='black')

        # Date line: horizontally centred, 60px below the title's line box.
        date_w = draw.textlength(date_str, font=date_font)
        date_x = int((W - date_w) // 2)
        date_y = title_y + t_ascent + t_descent + 60
        draw.text((date_x, date_y), date_str, font=date_font, fill='black')

        buf = BytesIO()
        img.save(buf, format='JPEG', quality=92)
        cpath = os.path.join(self.output_dir, 'cover.jpg')
        save_cover_data_to(buf.getvalue(), cpath)
        self.cover_path = cpath

    def _graphql_url(self, operation, variables, sha256):
        """Build a persisted-query GET URL for the Dow Jones GraphQL gateway.

        ``variables`` is the already-serialised JSON variables string and
        ``sha256`` the hash identifying the persisted query.
        """
        query = {
            'operationName': operation,
            'variables': variables,
            'extensions': '{"persistedQuery":{"version":1,"sha256Hash":"%s"}}' % sha256,
        }
        return 'https://shared-data.dowjones.io/gateway/graphql?' + urlencode(
            query, safe='()!', quote_via=quote
        )

    def parse_index(self):
        """Return ``[(section_label, [article_dict, ...]), ...]`` for one edition.

        The edition is the one whose ``publishedDateUtc`` contains the
        ``date`` recipe option, or the first listed edition when the option
        is unset or matches nothing. Also sets ``title``, ``timefmt`` and
        ``cover_date_label`` as side effects.

        Raises:
            ValueError: if the issue query returns no editions.
        """
        url = self._graphql_url(
            'IssueQuery',
            '{"publication":"WSJ","region":"US","masthead":"ITPNEXTGEN"}',
            'd938226e7d1c1fff050e7d084c72179e2713dcf4736d3a442c618c55b896f847',
        )
        raw = self.index_to_soup(url, raw=True)

        cat_data = json.loads(raw)['data']['mobileIssuesByMasthead']
        if not cat_data:
            raise ValueError('WSJ issue query returned no editions')
        edit = [x['datedLabel'] for x in cat_data][1:]
        self.log('**Past Editions available : ' + ' | '.join(edit))

        past_edition = self.recipe_specific_options.get('date')

        chosen = None
        if past_edition and isinstance(past_edition, str):
            chosen = next(
                (itm for itm in cat_data if past_edition in itm['publishedDateUtc']),
                None,
            )
            if chosen is None:
                self.log.warn(
                    'No edition found for', past_edition,
                    '— falling back to', cat_data[0]['datedLabel']
                )
        if chosen is None:
            chosen = cat_data[0]

        # Stamp the date directly into self.title — calibre's timefmt is
        # ignored on Kindle output profiles (periodical_date_in_title=False),
        # so we can't rely on it.
        self.cover_date_label = chosen['datedLabel']
        self.title = f'The Wall Street Journal - {chosen["datedLabel"]}'
        self.timefmt = ''
        sections_ = chosen['sections']

        self.log('Downloading ', chosen['datedLabel'])

        feeds = []
        # The final entry of the section list is deliberately not fetched.
        for sec in sections_[:-1]:
            # Pause between section queries.
            time.sleep(3)
            section = sec['label']
            self.log(section)
            cont_id = sec['key']

            sec_url = self._graphql_url(
                'SectionQuery',
                '{{"id":"{}"}}'.format(cont_id),
                '207fe93376f379bf223ed2734cf9313a28291293366a803db923666fa6b45026',
            )
            sec_raw = self.index_to_soup(sec_url, raw=True)

            sec_data = json.loads(sec_raw)['data']['summaryCollectionContent']['collectionItems']

            articles = []
            for group in sec_data:
                for item in group['collectionItems']:
                    content = item['content']
                    summary = content['mobileSummary']
                    title = summary['headline']['text']
                    try:
                        desc = summary['description']['content']['text']
                    except (TypeError, KeyError):
                        # Description is absent or null for this article.
                        desc = ''
                    url = content['sourceUrl']
                    self.log('          ', title, '\n\t', desc, '\n\t', url)
                    articles.append({'title': title, 'description': desc, 'url': url})
            feeds.append((section, articles))
        return feeds
