#!/usr/bin/python
# coding: utf-8

"""
Hard West Turn, 2019-06-04, 2019 version

Copyright (c) 2019 Nick Montfort <nickm@nickm.com>

Copying and distribution of this file, with or without modification, are
permitted in any medium without royalty provided the copyright notice and
this notice are preserved. This file is offered as-is, without any warranty.

Code as of July 2, when it was run to generate Hard West Turn 2019. This
docstring updated July 4.

This code relies on the existence of a particular table, populated with
links to articles, in a particular English Wikipedia article. If this
article is deleted or changed, or if the ID of this table changed (as has
happened twice since the earliest version of this program was written and run),
this code will not work as is.

I have made some edits in this 2019 version and reduced the length of the
hand-written texts which constitute the first paragraph, the first sentence of
each paragraph, and the end of some paragraphs.

I have also rewritten this 2019 version to use spaCy instead of TextBlob. This
version uses POS tagging (which could have been done in TextBlob, too) to
identify proper nouns at the beginning of sentences with reasonable, although
not perfect, accuracy. Detection of sentence-initial proper nouns seems to be
significantly improved.

However, this version leaves some parts of notes at the end of sentences;
that is, things like '[33'. It also does not segment some other sentences
properly, including some which actually contain proper nouns in the original
source and remain, in partial form, in the output. I removed either extra
characters or long parts of sentences (which have proper nouns/adjectives)
manually in the output this year, and will try again to deal with automating
this sort of cleanup in 2020. Other manual changes included correcting
spelling (and normalizing it to U.S. spelling) and punctuation. As always, no
content-related changes were made by hand.

-Nick Montfort, July 4, 2019
"""

import re
from random import choice, shuffle
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
import spacy

# Load spaCy's small English pipeline; it is used only for sentence
# segmentation and POS tagging of the scraped text.
nlp = spacy.load("en_core_web_sm")
english = 'http://en.wikipedia.org'
simple = 'http://simple.wikipedia.org'
# The English Wikipedia article whose incident table drives the whole
# generation; if the article or the table's heading id changes, this
# breaks (see the module docstring).
mass_shootings = english + '/wiki/Mass_shootings_in_the_United_States'
html = urlopen(mass_shootings).read()
soup = BeautifulSoup(html, 'lxml')
# The heading element that precedes the table of deadliest incidents.
deadliest = soup.find('span', id='Deadliest_mass_shootings_since_1949').parent

# incident name -> relative URL of the incident's Wikipedia article
incident = {}
# every internal /wiki link found inside the incident articles (with repeats)
links = []
# concatenated paragraph text of all the incident articles
text = ''
# harvested sentences for, respectively, parts one, two, and three of the book
litany, simple_litany, degenerate_litany = [], [], []

# Walk the data rows of the "deadliest" wikitable (skipping the header row)
# and record each incident's name and article link.
for row in deadliest.find_next_sibling('table', class_='wikitable').find_all('tr')[1:]:
    cells = row.find_all('td')
    try:
        incident_name = cells[0].text
        incident_link = cells[0].find('a')['href']
    except (IndexError, TypeError):
        # IndexError: a row with no <td> cells (e.g. a header-only row).
        # TypeError: a first cell without an <a> tag — find() returned
        # None, so subscripting ['href'] fails; previously uncaught.
        continue
    incident[incident_name] = incident_link

# Hand-written frame sentences: one opens each paragraph, and each may also
# be dropped in mid-paragraph by print_part(). The longer ones are removed
# before the final, degenerate part is printed.
para_frames = [
'This man was given to thinking of events of national importance.',
'The man thought to himself a good deal.',
'Certain things resonated in the otherwise still mind of the man.',
'Without outward sign of it, the man sometimes had a swirl of thought.',
'The man did not escape the country or himself.',
'The man went to find something.',
'Some things were known with certainty.',
'Some things were beyond the man’s ken.',
'The man dreamed at night sometimes, remembering a sliver in the morning.',
'The man may have never dreamed.',
'To forget what had been taken away, the man tried to fix his thoughts.',
'The man had regrets.',
'The man took things moment by moment.',
'The man remembered a lot of things.',
'The man knew that some things said were fake, some facts.',
'The man knew that people said things, sometimes for no reason.',
'The man had heard a book’s worth about the events.',
'The man had many thoughts, few of them clear.',
'The man knew what he knew.',
'The man had heard things.'
]

# Paragraph-ending declarations for part one; print_part() consumes one per
# paragraph (ten of the eleven are used, in shuffled order).
declarations = [
'The man had a tendency to watch, listen, and say little.',
"There was no avoiding the television's declarations.",
'The man preferred to eat alone, facing a window.',
'The man still remembered, imagined.',
'The man remained able to say all the words he needed to persist.',
'The man would sometimes whittle.',
'The man carried an envelope of remembrances, sealed by pressure.',
'The man believed in opportunity.',
'The man once caught himself tapping his foot without music playing.',
'There was no time at which the man saw anything suspicious.',
'The man knew his place.'
]

# Shorter paragraph-ending declarations for part two (the Simple English
# part), consumed the same way.
# NOTE(review): 'The hoped for a lack of news.' looks like a typo for
# 'He hoped for a lack of news.' — confirm with the author before changing,
# since this is output text.
simple_declarations = [
'It was a simple time.',
'Little restrained the man.',
"The man's world was dim.",
'The man loved his country.',
'Freedom was most important.',
'The man felt he was free.',
'Success was still elusive.',
'The man required nothing.',
'Here, the man could range.',
'The hoped for a lack of news.',
'There was nothing to be said.',
'The man was who he chose to be.'
]

# Travel sentences for part one, when the man still has his truck.
with_truck = [
'He drove off in his truck',
'He got into his truck',
'His truck carried him',
'His truck was still running',
'He went off in his truck'
]

# Travel sentences for parts two and three, after the truck is gone.
no_truck = [
'He got onto a long-distance bus',
'He gathered funds for a bus ticket',
'He managed to hitchhike',
'It was still possible to slip into a freight train',
'He was able to walk and camp along the way'
]

# Jobs appended to part one's travel sentences.
laborer_job = [
'as an itinerant locksmith',
'as a night watchman',
'as a mover',
'as a day laborer',
'as a scab longshoreman',
'as a greeter'
]

# Bleaker jobs appended to part two's travel sentences.
unpleasant_job = [
'cleaning rough industrial spaces',
'as a dishwasher',
'as a warehouse picker',
'collecting recyclables',
'as a lookout',
'in a slaughterhouse'
]

def all_lowercase(sent):
    """Return a cleaned string version of the spaCy sentence *sent* if it
    seems to contain no proper nouns, else None.

    A sentence qualifies when it contains a lowercase letter, starts with
    a capital, and has no capitals after its first character. If its only
    proper noun is the initial token (tagged NNP), that word is removed
    and the next word is capitalized. Footnote markers such as '[33]' —
    and dangling, unclosed ones such as '[33' left at the end of a
    mis-segmented sentence — are stripped. Sentences with fewer than
    three words, or ending in ':' or ')', yield None.
    """
    sent_str = str(sent)
    # Only consider sentences shaped 'Xxxx xxx ...': initial capital and
    # everything after it lowercase (i.e. no interior proper nouns).
    if re.search(r'[a-z]', sent_str) and sent_str[0] == sent_str[0].upper() and \
                                     sent_str[1:] == sent_str[1:].lower():
        # Remove closed Wikipedia footnote markers such as '[33]'.
        sent_str = re.sub(r'\[.*\]', '', sent_str)
        # Also remove a dangling, unclosed note fragment such as '[33'
        # left when segmentation cut the sentence before the closing
        # bracket (the cleanup flagged in the module docstring).
        sent_str = re.sub(r'\[[^\]]*$', '', sent_str)
        sent_str = re.sub(r'^\"$', '', sent_str)
        sent_str = re.sub(r'"', '', sent_str)
        sent_str = re.sub(r'\n', ' ', sent_str)
        sent_str = sent_str.strip()
        if len(sent_str.split()) > 2:
            # Sentences ending in ':' or ')' are usually fragments.
            if re.search(r'\:$', sent_str):
                return None
            if re.search(r'\)$', sent_str):
                return None
            # Drop a sentence-initial proper noun and recapitalize.
            if list(sent)[0].tag_ == 'NNP':
                sent_str = ' '.join(sent_str.split()[1:])
                sent_str = sent_str[0].upper() + sent_str[1:]
            return sent_str
    return None

# Scrape every incident article from the English Wikipedia: accumulate all
# paragraph text into `text` and collect every internal article link (used
# later to pick related Simple English Wikipedia articles).
for i in incident:
    article = english + incident[i]
    html = urlopen(article).read()
    soup = BeautifulSoup(html, 'lxml')
    content = soup.find('div', id='bodyContent')
    for p in content.find_all('p'):
        text += ' ' + p.getText()
    for a in content.find_all('a'):
        href = a.get('href')
        if href is not None:
            # Keep only ordinary article links: relative /wiki/... URLs with
            # no namespace colon (skips File:, Category:, Special:, etc.).
            if ':' not in href and re.match(r'/wiki', href):
                links.append(href)

# Segment the combined incident-article text into sentences and keep each
# one that passes the proper-noun filter.
document = nlp(text.strip())
litany.extend(
    cleaned
    for cleaned in (all_lowercase(s) for s in document.sents)
    if cleaned is not None
)

# Count how often each collected link occurs across the incident articles;
# links shared by more than one article (but fewer than 14) are taken as
# thematically relevant. Fetch each one's Simple English Wikipedia
# counterpart, if it exists, and harvest its qualifying sentences.
for count, rel_url in sorted(((links.count(e), e) for e in set(links)), reverse=True):
    if 1 < count < 14:
        article = simple + rel_url
        try:
            html = urlopen(article).read()
            soup = BeautifulSoup(html, 'lxml')
            content = soup.find('div', id='bodyContent')
            simple_text = ''
            for p in content.find_all('p'):
                simple_text += ' ' + p.getText()
            simple_tokens = nlp(simple_text.strip())
            for sent in simple_tokens.sents:
                string = all_lowercase(sent)
                if string is not None:
                    simple_litany.append(string)
        except HTTPError:
            # The article has no Simple English counterpart; skip it.
            continue

def add_to_degenerate(string, target=None):
    """Truncate *string* at its first comma, re-terminate it with a full
    stop, and append the result to *target* (the module-level
    degenerate_litany when *target* is None).

    Strings without a comma, or containing any parenthesis, are ignored —
    truncating at a comma could orphan an opening parenthesis. (The
    previous re.findall list comparison was equal only when both lists
    were empty, so it already meant "no parentheses at all"; this spells
    that out directly.)
    """
    if target is None:
        target = degenerate_litany
    if ',' in string and '(' not in string and ')' not in string:
        string = string.split(',')[0] + '.'
        # A first clause ending in 'a.m.' or 'p.m.' would otherwise end
        # 'm..' after the full stop is appended; drop the extra dot.
        if string[-3:] == 'm..':
            string = string[:-1]
        target.append(string)

# Build the degenerate litany: comma-truncate sentences from both litanies,
# then add doubled variants ("Short one, short one.") of the short, plain
# entries.
for string in simple_litany:
    add_to_degenerate(string)
for string in litany:
    add_to_degenerate(string)
# Iterate over a snapshot: the original appended to degenerate_litany while
# iterating it, which is fragile (it terminated only because each appended
# variant contains a comma and so never re-qualified). The snapshot
# produces exactly the same final list.
for string in list(degenerate_litany):
    if ' ' in string and len(string.split()) < 5 and \
                     ',' not in string and '(' not in string:
        degenerate_litany.append(string[:-1] + ', ' + string.lower())

def print_part(statements, declare, travel, job):
    """Print one of the three parts of the book: ten paragraphs of shuffled
    statements interleaved with stock frame sentences, each paragraph
    closed by a travel sentence (consuming one entry from *declare*) and,
    when *job* is non-empty, a line of work."""
    shuffle(statements)
    per_para = int(len(statements) / 10)
    start = 0
    for _ in range(10):
        para = '  ' + choice(para_frames)
        for offset in range(per_para):
            para += ' ' + statements[start + offset]
            # One-in-five chance of opening a fresh frame mid-paragraph.
            para += choice(['', '', '', '', '\n  ' + choice(para_frames)])
        print(para)
        start += per_para
        opener = choice(declare)
        declare.remove(opener)  # each declaration is used at most once
        if opener:
            opener += ' '
        closing = '  ' + opener + choice(travel)
        if job:
            closing += ' and he found work ' + choice(job)
        print(closing + '.')

# Front matter: title, copyright, source/license note, and the epigraph
# from the Book of Job.
print('Hard West Turn')
print('Nick Montfort')
print('')
print('Copyright © Nick Montfort 2019; July 4, 2019 edition')
print('')
print('This output text was computer-generated using text from the English Wikipedia, en.wikipedia.org, and the Simple English Wikipedia, simple.wikipedia.org. This source text is offered under the CC-BY-SA 3.0 Unported license, so this generated text is offered under the same license.')
print('')
print(u'And the LORD said unto Satan, Whence comest thou? Then Satan answered the LORD, and said, From going to and fro in the earth, and from walking up and down in it. And the LORD said unto Satan, Hast thou considered my servant Job, that there is none like him in the earth, a perfect and an upright man, one that feareth God, and escheweth evil? Then Satan answered the LORD, and said, Doth Job fear God for nought? Hast not thou made an hedge about him, and about his house, and about all that he hath on every side? thou hast blessed the work of his hands, and his substance is increased in the land. But put forth thine hand now, and touch all that he hath, and he will curse thee to thy face.')
print('')
# Hand-written opening paragraph, then part one: English Wikipedia
# sentences, the man still with his truck, laborer jobs.
print(u'Near the eastern shore of the great nation a man returned to find his family departed from the house their bank was set to seize. He thought on this, the television telling news, his little remaining liquor and less food dwindling. In four days he set off.')
print_part(litany, declarations, with_truck, laborer_job)
# Section separator.
print('')
print('•')
print('')
# Part two: Simple English Wikipedia sentences, no truck, unpleasant jobs.
print_part(simple_litany, simple_declarations, no_truck, unpleasant_job)
print('')
print('•')
print('')
# Keep only the shorter paragraph frames for the final, degenerate part.
# Iterate over a copy: removing from a list while iterating it makes the
# iterator skip the element after each removal, so the original loop left
# some long frames (any immediately following a removed one) in place.
for pf in list(para_frames):
    if len(pf) > 38:
        para_frames.remove(pf)
# Part three: the degenerate litany, with blank declarations and no jobs.
print_part(degenerate_litany, ['']*12, no_truck, [])
print('')
# Colophon: notes on the manual corrections and where to get the program.
print('Spelling and punctuation corrections were made, with U.S. spellings used throughout. Sentences beginning with a single proper noun have had these words manually removed. If proper nouns or adjectives occurred elsewhere in a sentence, the sentence was removed. No other changes were made by hand to the computer-generated output.')
print('The program that generated this book is offered under a permissive free software license. Download it from nickm.com and generate your own book, modify the code, or do whatever else you like with the program:')
print('https://nickm.com/code/hard_west_turn_2019.py')
print('The program was first drafted for National Novel Generation Month 2017:')
print('https://nanogenmo.github.io/')
