2018-10-08 18:11:51 -07:00
#!/usr/bin/env python3
2021-06-15 18:29:53 -07:00
# SPDX-License-Identifier: EUPL-1.2
2018-10-08 18:11:51 -07:00
import markovify
2019-01-11 04:47:42 -08:00
from bs4 import BeautifulSoup
2021-06-04 14:14:56 -07:00
from random import randint
2019-08-06 20:46:57 -07:00
import re , multiprocessing , sqlite3 , shutil , os , html
2018-10-08 18:11:51 -07:00
2021-06-04 14:38:36 -07:00
2019-08-06 20:46:57 -07:00
def make_sentence(output, cfg):
    """Generate one Markov-chain sentence from stored toots and send it via ``output``.

    A random sample of up to 10000 toots is read from a temporary copy of
    ``toots.db``, fed into a markovify newline-delimited model, and a short
    sentence (at most 500 characters) is generated from it.

    Args:
        output: Object exposing ``send()`` (the caller passes the writable end
            of a ``multiprocessing.Pipe``). Exactly one value is sent.
        cfg: Mapping providing ``learn_from_cw``, ``ignored_cws``,
            ``overlap_ratio_enabled``, ``overlap_ratio``, ``limit_length``,
            ``length_lower_limit``, ``length_upper_limit`` and
            ``mention_handling``.

    Sends:
        The generated sentence, a "Database is empty!" message when no toots
        matched, or ``None`` when no sentence could be generated.
    """

    class nlt_fixed(markovify.NewlineText):
        """Modified version of NewlineText that never rejects sentences."""

        def test_sentence_input(self, sentence):
            return True  # all sentences are valid <3

    # Create a copy of the database because reply.py will be using the main one.
    shutil.copyfile("toots.db", "toots-copy.db")
    try:
        db = sqlite3.connect("toots-copy.db")
        try:
            db.text_factory = str
            c = db.cursor()
            if cfg['learn_from_cw']:
                # One "?" placeholder per ignored content warning, e.g. "(?,?,?)".
                ignored_cws_query_params = "(" + ",".join("?" * len(cfg["ignored_cws"])) + ")"
                toots = c.execute(
                    f"SELECT content FROM `toots` WHERE cw IS NULL OR CW NOT IN {ignored_cws_query_params} ORDER BY RANDOM() LIMIT 10000",
                    cfg["ignored_cws"],
                ).fetchall()
            else:
                toots = c.execute(
                    "SELECT content FROM `toots` WHERE cw IS NULL ORDER BY RANDOM() LIMIT 10000"
                ).fetchall()
        finally:
            # The rows are fully materialised by fetchall(), so the connection
            # can be released on every path, including the empty-database one.
            db.close()
    finally:
        os.remove("toots-copy.db")

    if not toots:
        output.send("Database is empty! Try running main.py.")
        return

    nlt = markovify.NewlineText if cfg['overlap_ratio_enabled'] else nlt_fixed
    model = nlt("\n".join(toot[0] for toot in toots))

    max_words = None
    if cfg['limit_length']:
        max_words = randint(cfg['length_lower_limit'], cfg['length_upper_limit'])
    max_overlap_ratio = cfg['overlap_ratio'] if cfg['overlap_ratio_enabled'] else 0.7

    # make_short_sentence() returns None when it gives up; retry up to 10 times.
    sentence = None
    for _ in range(10):
        sentence = model.make_short_sentence(
            max_chars=500,
            tries=10000,
            max_overlap_ratio=max_overlap_ratio,
            max_words=max_words,
        )
        if sentence is not None:
            break

    # Optionally remove mentions. Skipped when generation failed, because
    # re.sub() would raise TypeError on None and nothing would be sent at all.
    if sentence is not None:
        if cfg['mention_handling'] == 1:
            # only strip a mention at the very start of the sentence
            sentence = re.sub(r"^\S*@\u200B\S*\s?", "", sentence)
        elif cfg['mention_handling'] == 0:
            # strip every mention
            sentence = re.sub(r"\S*@\u200B\S*\s?", "", sentence)

    output.send(sentence)
2021-06-04 14:38:36 -07:00
2019-08-06 20:46:57 -07:00
def make_toot(cfg):
    """Generate a toot in a child process, giving up after five seconds.

    ``make_sentence`` is run in a separate process so that a generation
    attempt that takes too long can be terminated.

    Args:
        cfg: Configuration mapping passed through to ``make_sentence``.

    Returns:
        The value sent back by ``make_sentence``, or a fixed failure message
        when the child timed out, sent ``None``, or exited without sending
        anything.
    """
    toot = None
    pin, pout = multiprocessing.Pipe(False)  # one-way pipe: pin reads, pout writes
    try:
        p = multiprocessing.Process(target=make_sentence, args=[pout, cfg])
        p.start()
        p.join(5)  # wait 5 seconds to get something
        if p.is_alive():  # if it's still trying to make a toot after 5 seconds
            p.terminate()
            p.join()
        elif pin.poll():
            # Only read when the child actually sent something. If it exited
            # without sending (e.g. it crashed), a bare recv() would block
            # forever because this process still holds the write end open.
            toot = pin.recv()
    finally:
        pin.close()
        pout.close()

    if toot is None:
        toot = "Toot generation failed! Contact Lynne (lynnesbian@fedi.lynnesbian.space) for assistance."
    return toot
2019-01-11 04:55:31 -08:00
2021-06-04 14:38:36 -07:00
2019-01-11 04:55:31 -08:00
def extract_toot(toot):
    """Convert a toot's HTML content into plain text.

    Line breaks and paragraphs become newlines, hashtag links become plain
    text, other links are replaced by their target URL, and profile URLs are
    rewritten as ``@user@instance`` mentions.

    Args:
        toot: HTML content of a toot as a string.

    Returns:
        The plain-text form of the toot without trailing newlines.
    """
    # NOTE(review): unescaping before parsing means escaped angle brackets in
    # the toot text are parsed as markup -- confirm this is intended.
    toot = html.unescape(toot)  # convert HTML escape codes to text
    soup = BeautifulSoup(toot, "html.parser")

    # Replace <br> with a linebreak. A real newline string is inserted because
    # get_text() only collects strings; renaming the tag has no effect on it.
    for lb in soup.select("br"):
        lb.replace_with("\n")

    # Ditto for <p>: terminate each paragraph with a newline.
    for p in soup.select("p"):
        p.append("\n")

    for ht in soup.select("a.hashtag"):  # convert hashtags from links to text
        ht.unwrap()

    # Convert <a href='https://example.com'>example.com</a> to just https://example.com
    for link in soup.select("a"):
        # Not all <a> tags have a href. has_attr() is required here: the `in`
        # operator on a tag tests its children, not its attributes.
        if link.has_attr("href"):
            link.replace_with(link["href"])

    text = soup.get_text()
    text = re.sub(r"https://([^/]+)/(@[^\s]+)", r"\2@\1", text)  # put mastodon-style mentions back in
    text = re.sub(r"https://([^/]+)/users/([^\s/]+)", r"@\2@\1", text)  # put pleroma-style mentions back in
    text = text.rstrip("\n")  # remove trailing newline(s)
    return text