forked from KayFaraday/pleroma-ebooks
rewrite for anyio+aiohttp
This commit is contained in:
parent
93095f62f3
commit
5d1c3397b6
|
@ -11,4 +11,5 @@ __pycache__/
|
||||||
.editorconfig
|
.editorconfig
|
||||||
.*.swp
|
.*.swp
|
||||||
config.json
|
config.json
|
||||||
|
config.toml
|
||||||
venv/
|
venv/
|
||||||
|
|
|
@ -48,13 +48,12 @@ mstdn-ebooks uses ActivityPub to download posts. This means that it is not depen
|
||||||
I recommend that you create your bot's account on a Mastodon instance. Creating a bot on a Pleroma instance means that your bot will be unable to reply, although posting will work just fine. However, even if your bot is on a Mastodon instance, it will be able to learn from any Pleroma or Misskey users just fine.
|
I recommend that you create your bot's account on a Mastodon instance. Creating a bot on a Pleroma instance means that your bot will be unable to reply, although posting will work just fine. However, even if your bot is on a Mastodon instance, it will be able to learn from any Pleroma or Misskey users just fine.
|
||||||
|
|
||||||
## Configuration
|
## Configuration
|
||||||
Configuring mstdn-ebooks is accomplished by editing `config.json`. If you want to use a different file for configuration, specify it with the `--cfg` argument. For example, if you want to use `/home/lynne/c.json` instead, you would run `python3 main.py --cfg /home/lynne/c.json` instead of just `python3 main.py`
|
Configuring mstdn-ebooks is accomplished by editing `config.toml`. If you want to use a different file for configuration, specify it with the `--cfg` argument. For example, if you want to use `/home/lynne/c.json` instead, you would run `python3 main.py --cfg /home/lynne/c.json` instead of just `python3 main.py`
|
||||||
|
|
||||||
| Setting | Default | Meaning |
|
| Setting | Default | Meaning |
|
||||||
|--------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
|--------------------------|-----------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
|
||||||
| site | https://botsin.space | The instance your bot will log in to and post from. This must start with `https://` or `http://` (preferably the latter) |
|
| site | https://botsin.space | The instance your bot will log in to and post from. This must start with `https://` or `http://` (preferably the latter) |
|
||||||
| cw | null | The content warning (aka subject) mstdn-ebooks will apply to non-error posts. |
|
| cw | null | The content warning (aka subject) mstdn-ebooks will apply to non-error posts. |
|
||||||
| instance_blacklist | ["bofa.lol", "witches.town", "knzk.me"] | If your bot is following someone from a blacklisted instance, it will skip over them and not download their posts. This is useful for ensuring that mstdn-ebooks doesn't waste time trying to download posts from dead instances, without you having to unfollow the user(s) from them. |
|
|
||||||
| learn_from_cw | false | If true, mstdn-ebooks will learn from CW'd posts. |
|
| learn_from_cw | false | If true, mstdn-ebooks will learn from CW'd posts. |
|
||||||
| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts with these CWs.
|
| ignored_cws | [] | If `learn_from_cw` is true, do not learn from posts with these CWs.
|
||||||
| mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). |
|
| mention_handling | 1 | 0: Never use mentions. 1: Only generate fake mentions in the middle of posts, never at the start. 2: Use mentions as normal (old behaviour). |
|
||||||
|
|
|
@ -0,0 +1,15 @@
|
||||||
|
{
|
||||||
|
"site": "https://botsin.space",
|
||||||
|
"cw": null,
|
||||||
|
"learn_from_cw": false,
|
||||||
|
"ignored_cws": [],
|
||||||
|
"mention_handling": 1,
|
||||||
|
"max_thread_length": 15,
|
||||||
|
"strip_paired_punctuation": false,
|
||||||
|
"limit_length": false,
|
||||||
|
"length_lower_limit": 5,
|
||||||
|
"length_upper_limit": 50,
|
||||||
|
"overlap_ratio_enabled": false,
|
||||||
|
"overlap_ratio": 0.7,
|
||||||
|
"access_token": ""
|
||||||
|
}
|
45
functions.py
45
functions.py
|
@ -1,11 +1,49 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# SPDX-License-Identifier: EUPL-1.2
|
# SPDX-License-Identifier: EUPL-1.2
|
||||||
|
|
||||||
|
import re
|
||||||
|
import os
|
||||||
|
import html
|
||||||
|
import json
|
||||||
|
import shutil
|
||||||
|
import sqlite3
|
||||||
|
import argparse
|
||||||
import markovify
|
import markovify
|
||||||
from bs4 import BeautifulSoup
|
import multiprocessing
|
||||||
|
import pytomlpp as toml
|
||||||
from random import randint
|
from random import randint
|
||||||
import re, multiprocessing, sqlite3, shutil, os, html
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
def arg_parser_factory(*, description):
|
||||||
|
parser = argparse.ArgumentParser(description=description)
|
||||||
|
parser.add_argument(
|
||||||
|
'-c', '--cfg', dest='cfg', default='config.toml', nargs='?',
|
||||||
|
help='Specify a custom location for the config file.'
|
||||||
|
)
|
||||||
|
return parser
|
||||||
|
|
||||||
|
def parse_args(*, description):
|
||||||
|
return arg_parser_factory(description=description).parse_args()
|
||||||
|
|
||||||
|
def load_config(cfg_path):
|
||||||
|
# TOML doesn't support null here so we have to use JSON 😒
|
||||||
|
with open('config.defaults.json') as f:
|
||||||
|
cfg = json.load(f)
|
||||||
|
|
||||||
|
with open(cfg_path) as f:
|
||||||
|
cfg.update(toml.load(f))
|
||||||
|
|
||||||
|
if not cfg['site'].startswith('https://') and not cfg['site'].startswith('http://'):
|
||||||
|
print("Site must begin with 'https://' or 'http://'. Value '{0}' is invalid - try 'https://{0}' instead.".format(cfg['site']), file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
if 'access_token' not in cfg:
|
||||||
|
print('No authentication info', file=sys.stderr)
|
||||||
|
print('Get a client id, client secret, and access token here: https://tinysubversions.com/notes/mastodon-bot/', file=sys.stderr)
|
||||||
|
print('Then put `access_token` in your config file.', file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
return cfg
|
||||||
|
|
||||||
def make_sentence(output, cfg):
|
def make_sentence(output, cfg):
|
||||||
class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences
|
class nlt_fixed(markovify.NewlineText): # modified version of NewlineText that never rejects sentences
|
||||||
|
@ -57,7 +95,6 @@ def make_sentence(output, cfg):
|
||||||
|
|
||||||
output.send(sentence)
|
output.send(sentence)
|
||||||
|
|
||||||
|
|
||||||
def make_toot(cfg):
|
def make_toot(cfg):
|
||||||
toot = None
|
toot = None
|
||||||
pin, pout = multiprocessing.Pipe(False)
|
pin, pout = multiprocessing.Pipe(False)
|
||||||
|
@ -71,7 +108,7 @@ def make_toot(cfg):
|
||||||
toot = pin.recv()
|
toot = pin.recv()
|
||||||
|
|
||||||
if toot is None:
|
if toot is None:
|
||||||
toot = "Toot generation failed! Contact Lynne (lynnesbian@fedi.lynnesbian.space) for assistance."
|
toot = 'Toot generation failed! Contact io@csdisaster.club for assistance.'
|
||||||
return toot
|
return toot
|
||||||
|
|
||||||
|
|
||||||
|
|
55
gen.py
55
gen.py
|
@ -1,42 +1,43 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# SPDX-License-Identifier: EUPL-1.2
|
# SPDX-License-Identifier: EUPL-1.2
|
||||||
|
|
||||||
from mastodon import Mastodon
|
import re
|
||||||
import argparse, json, re
|
|
||||||
import functions
|
import functions
|
||||||
|
from pleroma import Pleroma
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Generate and post a toot.')
|
def parse_args():
|
||||||
parser.add_argument(
|
parser = functions.arg_parser_factory(description='Generate and post a toot.')
|
||||||
'-c', '--cfg', dest='cfg', default='config.json', nargs='?',
|
parser.add_argument(
|
||||||
help="Specify a custom location for config.json.")
|
'-s', '--simulate', dest='simulate', action='store_true',
|
||||||
parser.add_argument(
|
help="Print the toot without actually posting it. Use this to make sure your bot's actually working.")
|
||||||
'-s', '--simulate', dest='simulate', action='store_true',
|
return parser.parse_args()
|
||||||
help="Print the toot without actually posting it. Use this to make sure your bot's actually working.")
|
|
||||||
|
|
||||||
args = parser.parse_args()
|
async def main():
|
||||||
|
args = parse_args()
|
||||||
|
cfg = functions.load_config(args.cfg)
|
||||||
|
|
||||||
cfg = json.load(open(args.cfg))
|
|
||||||
|
|
||||||
client = None
|
|
||||||
|
|
||||||
if not args.simulate:
|
|
||||||
client = Mastodon(
|
|
||||||
client_id=cfg['client']['id'],
|
|
||||||
client_secret=cfg['client']['secret'],
|
|
||||||
access_token=cfg['secret'],
|
|
||||||
api_base_url=cfg['site'])
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
toot = functions.make_toot(cfg)
|
toot = functions.make_toot(cfg)
|
||||||
if cfg['strip_paired_punctuation']:
|
if cfg['strip_paired_punctuation']:
|
||||||
toot = re.sub(r"[\[\]\(\)\{\}\"“”«»„]", "", toot)
|
toot = re.sub(r"[\[\]\(\)\{\}\"“”«»„]", "", toot)
|
||||||
if not args.simulate:
|
if not args.simulate:
|
||||||
try:
|
async with Pleroma(api_base_url=cfg['site'], access_token=cfg['access_token']) as pl:
|
||||||
client.status_post(toot, visibility='unlisted', spoiler_text=cfg['cw'])
|
try:
|
||||||
except Exception:
|
await pl.post(toot, visibility='unlisted', cw=cfg['cw'])
|
||||||
toot = "An error occurred while submitting the generated post. Contact lynnesbian@fedi.lynnesbian.space for assistance."
|
except Exception:
|
||||||
client.status_post(toot, visibility='unlisted', spoiler_text="Error!")
|
import traceback
|
||||||
|
toot = (
|
||||||
|
'An error occurred while submitting the generated post. '
|
||||||
|
'Contact io@csdisaster.club for assistance. Full traceback:\n\n'
|
||||||
|
+ traceback.format_exc()
|
||||||
|
)
|
||||||
|
await pl.status_post(toot, visibility='unlisted', cw='Error!')
|
||||||
|
raise
|
||||||
|
|
||||||
try:
|
try:
|
||||||
print(toot)
|
print(toot)
|
||||||
except UnicodeEncodeError:
|
except UnicodeEncodeError:
|
||||||
print(toot.encode("ascii", "ignore")) # encode as ASCII, dropping any non-ASCII characters
|
print(toot.encode("ascii", "ignore")) # encode as ASCII, dropping any non-ASCII characters
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
import anyio
|
||||||
|
anyio.run(main)
|
||||||
|
|
421
main.py
421
main.py
|
@ -1,263 +1,224 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# SPDX-License-Identifier: EUPL-1.2
|
# SPDX-License-Identifier: EUPL-1.2
|
||||||
|
|
||||||
from mastodon import Mastodon, MastodonUnauthorizedError
|
import re
|
||||||
import sqlite3, signal, sys, json, re, argparse
|
import sys
|
||||||
import requests
|
import json
|
||||||
|
import anyio
|
||||||
|
import asqlite
|
||||||
|
import sqlite3
|
||||||
|
import asyncio
|
||||||
|
import aiohttp
|
||||||
|
import argparse
|
||||||
import functions
|
import functions
|
||||||
|
import contextlib
|
||||||
|
import pytomlpp as toml
|
||||||
|
from http import HTTPStatus
|
||||||
|
from pleroma import Pleroma, http_session_factory
|
||||||
|
|
||||||
parser = argparse.ArgumentParser(description='Log in and download posts.')
|
PATTERNS = {
|
||||||
parser.add_argument(
|
"handle": re.compile(r'^.*@(.+)'),
|
||||||
'-c', '--cfg', dest='cfg', default='config.json', nargs='?',
|
"base_url": re.compile(r'https?:\/\/(.*)'),
|
||||||
help="Specify a custom location for config.json.")
|
"webfinger_template_url": re.compile(r'template="([^"]+)"'),
|
||||||
|
"post_id": re.compile(r'[^\/]+$'),
|
||||||
args = parser.parse_args()
|
|
||||||
|
|
||||||
scopes = ["read:statuses", "read:accounts", "read:follows", "write:statuses", "read:notifications", "write:accounts"]
|
|
||||||
# cfg defaults
|
|
||||||
|
|
||||||
cfg = {
|
|
||||||
"site": "https://botsin.space",
|
|
||||||
"cw": None,
|
|
||||||
"instance_blacklist": ["bofa.lol", "witches.town", "knzk.me"], # rest in piece
|
|
||||||
"learn_from_cw": False,
|
|
||||||
"mention_handling": 1,
|
|
||||||
"max_thread_length": 15,
|
|
||||||
"strip_paired_punctuation": False,
|
|
||||||
"limit_length": False,
|
|
||||||
"length_lower_limit": 5,
|
|
||||||
"length_upper_limit": 50,
|
|
||||||
"overlap_ratio_enabled": False,
|
|
||||||
"overlap_ratio": 0.7,
|
|
||||||
"ignored_cws": [],
|
|
||||||
}
|
}
|
||||||
|
|
||||||
try:
|
@contextlib.asynccontextmanager
|
||||||
cfg.update(json.load(open(args.cfg, 'r')))
|
async def get_db():
|
||||||
except FileNotFoundError:
|
async with asqlite.connect('toots.db') as conn:
|
||||||
open(args.cfg, "w").write("{}")
|
async with conn.cursor() as cur:
|
||||||
|
await cur.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS toots (
|
||||||
|
sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT,
|
||||||
|
id VARCHAR NOT NULL,
|
||||||
|
cw VARCHAR,
|
||||||
|
userid VARCHAR NOT NULL,
|
||||||
|
uri VARCHAR NOT NULL,
|
||||||
|
content VARCHAR NOT NULL
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
await cur.execute("""
|
||||||
|
CREATE TABLE IF NOT EXISTS cursors (
|
||||||
|
userid VARCHAR PRIMARY KEY,
|
||||||
|
next_page VARCHAR NOT NULL
|
||||||
|
)
|
||||||
|
""")
|
||||||
|
await cur.execute("""
|
||||||
|
CREATE TRIGGER IF NOT EXISTS dedup
|
||||||
|
AFTER INSERT ON toots
|
||||||
|
FOR EACH ROW BEGIN
|
||||||
|
DELETE FROM toots
|
||||||
|
WHERE rowid NOT IN (
|
||||||
|
SELECT MIN(sortid)
|
||||||
|
FROM toots GROUP BY uri
|
||||||
|
);
|
||||||
|
END
|
||||||
|
""")
|
||||||
|
await conn.commit()
|
||||||
|
yield conn
|
||||||
|
|
||||||
print("Using {} as configuration file".format(args.cfg))
|
async def main():
|
||||||
|
args = functions.parse_args(description='Log in and download posts.')
|
||||||
|
cfg = functions.load_config(args.cfg)
|
||||||
|
|
||||||
if not cfg['site'].startswith("https://") and not cfg['site'].startswith("http://"):
|
async with (
|
||||||
print("Site must begin with 'https://' or 'http://'. Value '{}' is invalid - try 'https://{}' instead.".format(cfg['site']))
|
Pleroma(api_base_url=cfg['site'], access_token=cfg['access_token']) as client,
|
||||||
sys.exit(1)
|
get_db() as db, db.cursor() as cur,
|
||||||
|
http_session_factory() as http,
|
||||||
|
):
|
||||||
|
try:
|
||||||
|
following = await client.following()
|
||||||
|
except aiohttp.ClientResponseError as exc:
|
||||||
|
if exc.code == HTTPStatus.FORBIDDEN:
|
||||||
|
print(f'The provided access token in {args.cfg} is invalid.', file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
if "client" not in cfg:
|
async with anyio.create_task_group() as tg:
|
||||||
print("No application info -- registering application with {}".format(cfg['site']))
|
for acc in following:
|
||||||
client_id, client_secret = Mastodon.create_app(
|
tg.start_soon(fetch_posts, cfg, http, cur, acc)
|
||||||
"mstdn-ebooks",
|
|
||||||
api_base_url=cfg['site'],
|
|
||||||
scopes=scopes,
|
|
||||||
website="https://github.com/Lynnesbian/mstdn-ebooks")
|
|
||||||
|
|
||||||
cfg['client'] = {
|
print('Done!')
|
||||||
"id": client_id,
|
|
||||||
"secret": client_secret
|
|
||||||
}
|
|
||||||
|
|
||||||
if "secret" not in cfg:
|
await db.commit()
|
||||||
print("No user credentials -- logging in to {}".format(cfg['site']))
|
await db.execute('VACUUM') # compact db
|
||||||
client = Mastodon(
|
await db.commit()
|
||||||
client_id=cfg['client']['id'],
|
|
||||||
client_secret=cfg['client']['secret'],
|
|
||||||
api_base_url=cfg['site'])
|
|
||||||
|
|
||||||
print("Open this URL and authenticate to give mstdn-ebooks access to your bot's account: {}".format(client.auth_request_url(scopes=scopes)))
|
async def fetch_posts(cfg, http, cur, acc):
|
||||||
cfg['secret'] = client.log_in(code=input("Secret: "), scopes=scopes)
|
next_page = await (await cur.execute('SELECT next_page FROM cursors WHERE userid = ?', (acc['id'],))).fetchone()
|
||||||
|
|
||||||
json.dump(cfg, open(args.cfg, "w+"))
|
|
||||||
|
|
||||||
|
|
||||||
def extract_toot(toot):
|
|
||||||
toot = functions.extract_toot(toot)
|
|
||||||
toot = toot.replace("@", "@\u200B") # put a zws between @ and username to avoid mentioning
|
|
||||||
return(toot)
|
|
||||||
|
|
||||||
|
|
||||||
def get(*args, **kwargs):
|
|
||||||
r = requests.get(*args, **kwargs)
|
|
||||||
r.raise_for_status()
|
|
||||||
return r
|
|
||||||
|
|
||||||
|
|
||||||
client = Mastodon(
|
|
||||||
client_id=cfg['client']['id'],
|
|
||||||
client_secret=cfg['client']['secret'],
|
|
||||||
access_token=cfg['secret'],
|
|
||||||
api_base_url=cfg['site'])
|
|
||||||
|
|
||||||
try:
|
|
||||||
me = client.account_verify_credentials()
|
|
||||||
except MastodonUnauthorizedError:
|
|
||||||
print("The provided access token in {} is invalid. Please delete {} and run main.py again.".format(args.cfg, args.cfg))
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
following = client.account_following(me.id)
|
|
||||||
|
|
||||||
db = sqlite3.connect("toots.db")
|
|
||||||
db.text_factory = str
|
|
||||||
c = db.cursor()
|
|
||||||
c.execute("CREATE TABLE IF NOT EXISTS toots (sortid INTEGER UNIQUE PRIMARY KEY AUTOINCREMENT, id VARCHAR NOT NULL, cw VARCHAR, userid VARCHAR NOT NULL, uri VARCHAR NOT NULL, content VARCHAR NOT NULL)")
|
|
||||||
c.execute("CREATE TABLE IF NOT EXISTS cursors (userid VARCHAR PRIMARY KEY, next_page VARCHAR NOT NULL)")
|
|
||||||
c.execute("CREATE TRIGGER IF NOT EXISTS dedup AFTER INSERT ON toots FOR EACH ROW BEGIN DELETE FROM toots WHERE rowid NOT IN (SELECT MIN(sortid) FROM toots GROUP BY uri ); END; ")
|
|
||||||
db.commit()
|
|
||||||
|
|
||||||
|
|
||||||
def handleCtrlC(signal, frame):
|
|
||||||
print("\nPREMATURE EVACUATION - Saving chunks")
|
|
||||||
db.commit()
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
signal.signal(signal.SIGINT, handleCtrlC)
|
|
||||||
|
|
||||||
patterns = {
|
|
||||||
"handle": re.compile(r"^.*@(.+)"),
|
|
||||||
"url": re.compile(r"https?:\/\/(.*)"),
|
|
||||||
"uri": re.compile(r'template="([^"]+)"'),
|
|
||||||
"pid": re.compile(r"[^\/]+$"),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def insert_toot(oii, acc, post, cursor): # extracted to prevent duplication
|
|
||||||
pid = patterns["pid"].search(oii['object']['id']).group(0)
|
|
||||||
cursor.execute("REPLACE INTO toots (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)", (
|
|
||||||
pid,
|
|
||||||
oii['object']['summary'] or None,
|
|
||||||
acc.id,
|
|
||||||
oii['object']['id'],
|
|
||||||
post
|
|
||||||
))
|
|
||||||
|
|
||||||
|
|
||||||
for f in following:
|
|
||||||
next_page = c.execute("SELECT next_page FROM cursors WHERE userid = ?", (f.id,)).fetchone()
|
|
||||||
direction = 'next'
|
direction = 'next'
|
||||||
if next_page is not None:
|
if next_page is not None:
|
||||||
next_page ,= next_page
|
next_page ,= next_page
|
||||||
direction = 'prev'
|
direction = 'prev'
|
||||||
print(f'{next_page = }')
|
print('Downloading posts for user @' + acc['acct'])
|
||||||
print("Downloading posts for user @" + f.acct)
|
|
||||||
|
|
||||||
# find the user's activitypub outbox
|
page = await fetch_first_page(cfg, http, acc, next_page)
|
||||||
print("WebFingering...")
|
|
||||||
instance = patterns["handle"].search(f.acct)
|
|
||||||
if instance is None:
|
|
||||||
instance = patterns["url"].search(cfg['site']).group(1)
|
|
||||||
else:
|
|
||||||
instance = instance.group(1)
|
|
||||||
|
|
||||||
if instance in cfg['instance_blacklist']:
|
if 'next' not in page and 'prev' not in page:
|
||||||
print("skipping blacklisted instance: {}".format(instance))
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
# 1. download host-meta to find webfinger URL
|
|
||||||
r = get("https://{}/.well-known/host-meta".format(instance), timeout=10)
|
|
||||||
# 2. use webfinger to find user's info page
|
|
||||||
uri = patterns["uri"].search(r.text).group(1)
|
|
||||||
uri = uri.format(uri="{}@{}".format(f.username, instance))
|
|
||||||
r = get(uri, headers={"Accept": "application/json"}, timeout=10)
|
|
||||||
j = r.json()
|
|
||||||
found = False
|
|
||||||
for link in j['links']:
|
|
||||||
if link['rel'] == 'self':
|
|
||||||
# this is a link formatted like "https://instan.ce/users/username", which is what we need
|
|
||||||
uri = link['href']
|
|
||||||
found = True
|
|
||||||
break
|
|
||||||
if not found:
|
|
||||||
print("Couldn't find a valid ActivityPub outbox URL.")
|
|
||||||
|
|
||||||
# 3. download a page of the outbox
|
|
||||||
uri = next_page or "{}/outbox?page=true".format(uri)
|
|
||||||
r = get(uri, timeout=15)
|
|
||||||
j = r.json()
|
|
||||||
except:
|
|
||||||
print("oopsy woopsy!! we made a fucky wucky!!!\n(we're probably rate limited, please hang up and try again)")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
if 'next' not in j and 'prev' not in j:
|
|
||||||
# there's only one page of results, don't bother doing anything special
|
# there's only one page of results, don't bother doing anything special
|
||||||
pass
|
pass
|
||||||
else:
|
else:
|
||||||
# this is for when we're all done. it points to the activities created *after* we started fetching.
|
# this is for when we're all done. it points to the activities created *after* we started fetching.
|
||||||
next_page = j['prev']
|
next_page = page['prev']
|
||||||
if next_page is None:
|
|
||||||
uri = j[direction]
|
|
||||||
r = get(uri)
|
|
||||||
j = r.json()
|
|
||||||
|
|
||||||
print("Downloading and saving posts", end='', flush=True)
|
print('Downloading and saving posts', end='', flush=True)
|
||||||
done = False
|
done = False
|
||||||
try:
|
while not done and len(page['orderedItems']) > 0:
|
||||||
while not done and len(j['orderedItems']) > 0:
|
try:
|
||||||
for oi in j['orderedItems']:
|
async with anyio.create_task_group() as tg:
|
||||||
if oi['type'] != "Create":
|
for obj in page['orderedItems']:
|
||||||
continue # this isn't a toot/post/status/whatever, it's a boost or a follow or some other activitypub thing. ignore
|
tg.start_soon(process_object, cur, acc, obj)
|
||||||
|
except DoneWithAccount:
|
||||||
# its a toost baby
|
done = True
|
||||||
content = oi['object']['content']
|
continue
|
||||||
toot = extract_toot(content)
|
except anyio.ExceptionGroup as eg:
|
||||||
# print(toot)
|
for exc in eg.exceptions:
|
||||||
try:
|
if isinstance(exc, DoneWithAccount):
|
||||||
if c.execute("SELECT COUNT(*) FROM toots WHERE uri LIKE ?", (oi['object']['id'],)).fetchone()[0] > 0:
|
|
||||||
# we've caught up to the notices we've already downloaded, so we can stop now
|
|
||||||
# you might be wondering, "lynne, what if the instance ratelimits you after 40 posts, and they've made 60 since main.py was last run? wouldn't the bot miss 20 posts and never be able to see them?" to which i reply, "i know but i don't know how to fix it"
|
|
||||||
done = True
|
|
||||||
continue
|
|
||||||
if 'lang' in cfg:
|
|
||||||
try:
|
|
||||||
if oi['object']['contentMap'][cfg['lang']]: # filter for language
|
|
||||||
insert_toot(oi, f, toot, c)
|
|
||||||
except KeyError:
|
|
||||||
# JSON doesn't have contentMap, just insert the toot irregardlessly
|
|
||||||
insert_toot(oi, f, toot, c)
|
|
||||||
else:
|
|
||||||
insert_toot(oi, f, toot, c)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
done = True
|
done = True
|
||||||
break
|
continue
|
||||||
except sqlite3.Error:
|
|
||||||
pass # ignore any toots that don't successfully go into the DB
|
|
||||||
|
|
||||||
# get the next/previous page
|
# get the next/previous page
|
||||||
try:
|
try:
|
||||||
r = get(j[direction], timeout=15)
|
async with http.get(page[direction], timeout=15) as resp:
|
||||||
except requests.Timeout:
|
page = await resp.json()
|
||||||
print("HTTP timeout, site did not respond within 15 seconds")
|
except asyncio.TimeoutError:
|
||||||
except KeyError:
|
print('HTTP timeout, site did not respond within 15 seconds', file=sys.stderr)
|
||||||
print("Couldn't get next page - we've probably got all the posts")
|
except KeyError:
|
||||||
except KeyboardInterrupt:
|
print("Couldn't get next page - we've probably got all the posts", file=sys.stderr)
|
||||||
|
except KeyboardInterrupt:
|
||||||
|
done = True
|
||||||
|
break
|
||||||
|
except aiohttp.ClientResponseError as exc:
|
||||||
|
if exc.code == HTTPStatus.TOO_MANY_REQUESTS:
|
||||||
|
print("We're rate limited. Skipping to next account.")
|
||||||
done = True
|
done = True
|
||||||
break
|
break
|
||||||
except:
|
raise
|
||||||
print("An error occurred while trying to obtain more posts.")
|
except Exception:
|
||||||
|
import traceback
|
||||||
|
print('An error occurred while trying to obtain more posts:', file=sys.stderr)
|
||||||
|
traceback.print_exc()
|
||||||
|
|
||||||
j = r.json()
|
print('.', end='', flush=True)
|
||||||
print('.', end='', flush=True)
|
else:
|
||||||
else:
|
# the while loop ran without breaking
|
||||||
# the while loop ran without breaking
|
await cur.execute('REPLACE INTO cursors (userid, next_page) VALUES (?, ?)', (acc['id'], next_page))
|
||||||
c.execute("REPLACE INTO cursors (userid, next_page) VALUES (?, ?)", (f.id, next_page))
|
await cur.connection.commit()
|
||||||
db.commit()
|
|
||||||
|
|
||||||
print(" Done!")
|
print(' Done!')
|
||||||
except requests.HTTPError as e:
|
|
||||||
if e.response.status_code == 429:
|
|
||||||
print("Rate limit exceeded. This means we're downloading too many posts in quick succession. Saving toots to database and moving to next followed account.")
|
|
||||||
db.commit()
|
|
||||||
else:
|
|
||||||
# TODO: remove duplicate code
|
|
||||||
print("Encountered an error! Saving posts to database and moving to next followed account.")
|
|
||||||
db.commit()
|
|
||||||
except Exception:
|
|
||||||
print("Encountered an error! Saving posts to database and moving to next followed account.")
|
|
||||||
db.commit()
|
|
||||||
|
|
||||||
print("Done!")
|
async def finger(cfg, http, acc):
|
||||||
|
instance = PATTERNS['handle'].search(acc['acct'])
|
||||||
|
if instance is None:
|
||||||
|
instance = PATTERNS['base_url'].search(cfg['site'])[1]
|
||||||
|
else:
|
||||||
|
instance = instance[1]
|
||||||
|
|
||||||
db.commit()
|
# 1. download host-meta to find webfinger URL
|
||||||
db.execute("VACUUM") # compact db
|
async with http.get('https://{}/.well-known/host-meta'.format(instance), timeout=10) as resp:
|
||||||
db.commit()
|
host_meta = await resp.text()
|
||||||
db.close()
|
|
||||||
|
# 2. use webfinger to find user's info page
|
||||||
|
webfinger_url = PATTERNS['webfinger_template_url'].search(host_meta).group(1)
|
||||||
|
webfinger_url = webfinger_url.format(uri='{}@{}'.format(acc['username'], instance))
|
||||||
|
|
||||||
|
async with http.get(webfinger_url, headers={'Accept': 'application/json'}, timeout=10) as resp:
|
||||||
|
profile = await resp.json()
|
||||||
|
|
||||||
|
for link in profile['links']:
|
||||||
|
if link['rel'] == 'self':
|
||||||
|
# this is a link formatted like 'https://instan.ce/users/username', which is what we need
|
||||||
|
return link['href']
|
||||||
|
|
||||||
|
print("Couldn't find a valid ActivityPub outbox URL.", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
class DoneWithAccount(Exception): pass
|
||||||
|
|
||||||
|
async def process_object(cur, acc, obj):
|
||||||
|
if obj['type'] != 'Create':
|
||||||
|
# this isn't a toot/post/status/whatever, it's a boost or a follow or some other activitypub thing. ignore
|
||||||
|
return
|
||||||
|
|
||||||
|
# its a toost baby
|
||||||
|
content = obj['object']['content']
|
||||||
|
toot = extract_toot(content)
|
||||||
|
try:
|
||||||
|
await cur.execute('SELECT COUNT(*) FROM toots WHERE uri = ?', (obj['object']['id'],))
|
||||||
|
existing = await cur.fetchone()
|
||||||
|
if existing is not None and existing[0]:
|
||||||
|
# we've caught up to the notices we've already downloaded, so we can stop now
|
||||||
|
# you might be wondering, 'lynne, what if the instance ratelimits you after 40 posts, and they've made 60 since main.py was last run? wouldn't the bot miss 20 posts and never be able to see them?' to which i reply, 'i know but i don't know how to fix it'
|
||||||
|
raise DoneWithAccount
|
||||||
|
await insert_toot(cur, acc, obj, toot)
|
||||||
|
except sqlite3.Error:
|
||||||
|
pass # ignore any toots that don't successfully go into the DB
|
||||||
|
|
||||||
|
async def fetch_first_page(cfg, http, acc, next_page):
|
||||||
|
# download a page of the outbox
|
||||||
|
if not next_page:
|
||||||
|
print('Fingering UwU...')
|
||||||
|
# find the user's activitypub outbox
|
||||||
|
outbox_url = await finger(cfg, http, acc) + '/outbox?page=true'
|
||||||
|
else:
|
||||||
|
outbox_url = next_page
|
||||||
|
|
||||||
|
async with http.get(outbox_url, timeout=15) as resp:
|
||||||
|
return await resp.json()
|
||||||
|
|
||||||
|
def extract_toot(toot):
|
||||||
|
toot = functions.extract_toot(toot)
|
||||||
|
toot = toot.replace('@', '@\u200B') # put a zws between @ and username to avoid mentioning
|
||||||
|
return(toot)
|
||||||
|
|
||||||
|
async def insert_toot(cursor, acc, obj, content):
|
||||||
|
post_id = PATTERNS['post_id'].search(obj['object']['id']).group(0)
|
||||||
|
await cursor.execute('REPLACE INTO toots (id, cw, userid, uri, content) VALUES (?, ?, ?, ?, ?)', (
|
||||||
|
post_id,
|
||||||
|
obj['object']['summary'] or None,
|
||||||
|
acc['id'],
|
||||||
|
obj['object']['id'],
|
||||||
|
content,
|
||||||
|
))
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
anyio.run(main)
|
||||||
|
|
|
@ -0,0 +1,80 @@
|
||||||
|
# SPDX-License-Identifier: EUPL-1.2
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import aiohttp
|
||||||
|
|
||||||
|
USER_AGENT = (
|
||||||
|
'pleroma-ebooks (https://github.com/ioistired/pleroma-ebooks); '
|
||||||
|
'aiohttp/{aiohttp.__version__}; '
|
||||||
|
'python/{py_version}'
|
||||||
|
)
|
||||||
|
|
||||||
|
def http_session_factory(headers={}):
|
||||||
|
return aiohttp.ClientSession(
|
||||||
|
headers={'User-Agent': USER_AGENT, **headers},
|
||||||
|
raise_for_status=True,
|
||||||
|
)
|
||||||
|
|
||||||
|
class Pleroma:
|
||||||
|
def __init__(self, *, api_base_url, access_token):
|
||||||
|
self.api_base_url = api_base_url.rstrip('/')
|
||||||
|
py_version = '.'.join(map(str, sys.version_info))
|
||||||
|
self._session = http_session_factory({'Authorization': 'Bearer ' + access_token})
|
||||||
|
self._logged_in_id = None
|
||||||
|
|
||||||
|
async def __aenter__(self):
|
||||||
|
self._session = await self._session.__aenter__()
|
||||||
|
return self
|
||||||
|
|
||||||
|
async def __aexit__(self, *excinfo):
|
||||||
|
return await self._session.__aexit__(*excinfo)
|
||||||
|
|
||||||
|
async def request(self, method, path, **kwargs):
|
||||||
|
async with self._session.request(method, self.api_base_url + path, **kwargs) as resp:
|
||||||
|
return await resp.json()
|
||||||
|
|
||||||
|
async def verify_credentials(self):
|
||||||
|
return await self.request('GET', '/api/v1/accounts/verify_credentials')
|
||||||
|
|
||||||
|
me = verify_credentials
|
||||||
|
|
||||||
|
async def _get_logged_in_id(self):
|
||||||
|
if self._logged_in_id is None:
|
||||||
|
self._logged_in_id = (await self.me())['id']
|
||||||
|
return self._logged_in_id
|
||||||
|
|
||||||
|
async def following(self, account_id=None):
|
||||||
|
account_id = account_id or await self._get_logged_in_id()
|
||||||
|
return await self.request('GET', f'/api/v1/accounts/{account_id}/following')
|
||||||
|
|
||||||
|
async def post(self, content, *, in_reply_to_id=None, cw=None, visibility=None):
|
||||||
|
if visibility not in {None, 'private', 'public', 'unlisted', 'direct'}:
|
||||||
|
raise ValueError('invalid visibility', visibility)
|
||||||
|
|
||||||
|
if isinstance(in_reply_to_id, dict) and 'id' in in_reply_to_id:
|
||||||
|
in_reply_to_id = in_reply_to_id['id']
|
||||||
|
|
||||||
|
data = dict(status=content, in_reply_to_id=in_reply_to_id)
|
||||||
|
if visibility is not None:
|
||||||
|
data['visibility'] = visibility
|
||||||
|
if cw is not None:
|
||||||
|
data['spoiler_text'] = cw
|
||||||
|
|
||||||
|
return await self.request('POST', '/api/v1/statuses', data=data)
|
||||||
|
|
||||||
|
async def reply(self, to_status, content, *, cw=None):
|
||||||
|
user_id = await self._get_logged_in_id()
|
||||||
|
|
||||||
|
mentioned_accounts = {}
|
||||||
|
mentioned_accounts[to_status['account']['id']] = to_status['account']['acct']
|
||||||
|
for account in to_status['mentions']:
|
||||||
|
if account['id'] != user_id and account['id'] not in mentioned_accounts:
|
||||||
|
mentioned_accounts[account.id] = account.acct
|
||||||
|
|
||||||
|
status = ''.join('@' + x + ' ' for x in mentioned_accounts.values()) + content
|
||||||
|
|
||||||
|
visibility = to_status['visibility']
|
||||||
|
if cw is None and 'spoiler_text' in to_status:
|
||||||
|
cw = 're: ' + to_status['spoiler_text']
|
||||||
|
|
||||||
|
return await self.post(content, in_reply_to_id=to_status['id'], cw=cw, visibility=visibility)
|
|
@ -1,4 +1,5 @@
|
||||||
Mastodon.py==1.5.1
|
markovify ~= 0.8
|
||||||
markovify==0.8.2
|
|
||||||
beautifulsoup4==4.9.1
|
beautifulsoup4==4.9.1
|
||||||
requests==2.24.0
|
aiohttp ~= 3.0
|
||||||
|
pytomlpp ~= 1.0
|
||||||
|
anyio ~= 3.0
|
||||||
|
|
Loading…
Reference in New Issue