Skip to content
Snippets Groups Projects
Commit 636cf526 authored by c-tim's avatar c-tim
Browse files

Merge branch 'pad-downloader' into 'main'

Pad downloader

See merge request !11
parents 311e82ba 1274fefe
No related branches found
No related tags found
1 merge request: !11 Pad downloader
#!/usr/bin/env python3
from typing import Dict, Optional
from os import environ, path, getcwd, makedirs
from dotenv import load_dotenv
from urllib.parse import urlparse, urlunparse, ParseResult
import argparse
import requests
import re
class EnvDefault(argparse.Action):
    """Argparse action that can take its default from an environment variable.

    If no truthy ``default`` is supplied and ``envvar`` names a variable that
    is present in the process environment, that variable's value becomes the
    default. An option that ends up with a truthy default is no longer
    treated as required.
    """

    def __init__(self, envvar, required=True, default=None, **kwargs):
        # The environment is only consulted when the caller gave no usable
        # default of their own.
        use_env = not default and envvar and envvar in environ
        if use_env:
            default = environ[envvar]
        # A value is already available, so the option may be omitted.
        if default and required:
            required = False
        super().__init__(default=default, required=required, **kwargs)

    def __call__(self, parser, namespace, values, option_string=None):
        # Store the value that was given on the command line.
        setattr(namespace, self.dest, values)
def _download_uploaded_file(
        file: ParseResult,
        outdir: str,
        connectsid: Optional[str]):
    """Download one uploaded file and store it below ``<outdir>/uploads/``.

    Args:
        file: Parsed URL of the uploaded file. The last path segment is used
            as the local file name.
        outdir: Directory below which the ``uploads`` folder is created.
        connectsid: Value for the ``connect.sid`` cookie, or ``None`` to send
            the request without that cookie.

    Failures are reported on stdout and the function returns without writing
    anything; nothing is raised for an unsuccessful HTTP status.
    """
    # A URL that ends in "/" names no file; writing would target a directory,
    # so bail out before spending a request on it.
    file_name = file.path.split("/")[-1]
    if not file_name:
        print(f"Failed to download file {file}. The URL does not name a file.")
        return

    headers: Dict[str, str] = {}
    cookies: Dict[str, str] = {}
    if connectsid is not None:
        cookies['connect.sid'] = connectsid

    r = requests.get(urlunparse(file), headers=headers, cookies=cookies)
    if not r.ok:
        # errors='replace': an error body is not guaranteed to be UTF-8 and
        # reporting the failure must not itself raise.
        print(f"Failed to download file {file}. Status: {r.status_code}. Response: {r.content.decode('utf-8', errors='replace')}")
        return

    outfile = outdir + "/uploads/" + file_name
    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(path.dirname(outfile), exist_ok=True)
    with open(outfile, 'wb') as f:
        f.write(r.content)
def _download_pad_recursive(
        note: ParseResult,
        url_pattern: re.Pattern,
        outdir: str,
        connectsid: Optional[str],
        downloaded: dict[str, str]):
    """Download a note/pad as markdown and follow the links found in it.

    Args:
        note: Parsed URL of the note. URLs whose path contains ``/uploads/``
            are handed to ``_download_uploaded_file`` instead.
        url_pattern: Compiled pattern with a named group ``path``; every match
            in the downloaded markdown is treated as a further note on the
            same scheme and host as ``note``.
        outdir: Directory the ``<name>.md`` files are written to.
        connectsid: Value for the ``connect.sid`` cookie, or ``None`` to send
            requests without that cookie.
        downloaded: Maps note URL to the file it was written to. Updated in
            place; also used to avoid downloading a note twice.

    Failures are reported on stdout and end the traversal of that branch;
    nothing is raised for an unsuccessful HTTP status or content type.
    """
    if "/uploads/" in note.path:
        _download_uploaded_file(note, outdir, connectsid)
        return

    note_url = urlunparse(note)
    if note_url in downloaded:
        return

    # A link to the bare server ("https://host" or "https://host/") has no
    # note name; requesting "<path>/download" for it is pointless and the
    # result would be stored as ".md".
    note_name = note.path.split("/")[-1]
    if not note_name:
        print(f"Skipping {note_url}: the URL does not name a note/pad.")
        return

    headers: Dict[str, str] = {'Accept': 'text/markdown'}
    cookies: Dict[str, str] = {}
    if connectsid is not None:
        cookies['connect.sid'] = connectsid

    request_url = note._replace(path=note.path + "/download")
    r = requests.get(urlunparse(request_url), headers=headers, cookies=cookies)
    if not r.ok:
        # errors='replace': an error body is not guaranteed to be UTF-8.
        print(f"Failed to download pad {note}. Status: {r.status_code}. Response: {r.content.decode('utf-8', errors='replace')}")
        return

    # The header may be absent, and "text/markdown" without parameters is
    # just as valid as "text/markdown; charset=...".
    content_type = r.headers.get("Content-Type", "")
    if content_type.split(";")[0].strip().lower() != "text/markdown":
        print(f"Pad {note} has wrong content type: {content_type}.")
        return

    response_content = r.content
    outfile = outdir + "/" + note_name + ".md"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    makedirs(path.dirname(outfile), exist_ok=True)
    with open(outfile, 'wb') as f:
        f.write(response_content)
    # Record the note before recursing so that cyclic links terminate.
    downloaded[note_url] = outfile

    # The file is already stored byte-for-byte; a stray invalid byte must not
    # abort link discovery for the rest of the crawl.
    content_string = response_content.decode('utf-8', errors='replace')
    for match in url_pattern.finditer(content_string):
        new_note_url = ParseResult(
            scheme=note.scheme,
            netloc=note.netloc,
            path=match.group("path"),
            params="",
            query="",
            fragment="")
        _download_pad_recursive(new_note_url, url_pattern, outdir, connectsid, downloaded)
def download_pads(initial_note: str,
                  outdir: str,
                  connectsid: Optional[str] = None) -> Dict[str, str]:
    """Download ``initial_note`` and every note reachable from it.

    Args:
        initial_note: URL of the note to start from. Its scheme and host
            define which links in the markdown are followed.
        outdir: Directory the downloaded files are written to.
        connectsid: Optional value for the ``connect.sid`` cookie.

    Returns:
        The mapping filled in by ``_download_pad_recursive``: note URL to the
        file it was written to.
    """
    start = urlparse(initial_note)

    # Only URLs with the same scheme and host as the start note are matched;
    # whatever follows is captured as the named group "path".
    server_prefix = re.escape(start.scheme) + r":\/\/" + re.escape(start.netloc)
    url_pattern = re.compile(server_prefix + r"\b(?P<path>[-a-zA-Z0-9@:%_\+.~&\/\/=]*)")

    # Drop params, query and fragment so the start note is keyed like the
    # links discovered later.
    normalized_start: ParseResult = start._replace(params="", query="", fragment="")

    visited: dict[str, str] = {}
    _download_pad_recursive(normalized_start, url_pattern, outdir, connectsid, visited)
    return visited
def main():
    """Command-line entry point: parse arguments and download the pads.

    Loads ``.env`` from the current working directory (and via python-dotenv's
    default search) before the parser is built, so values from there are
    visible to the ``EnvDefault`` options. Prints one line per downloaded
    note/pad.
    """
    default_outdir = "downloads"
    base_dir = path.abspath(getcwd())
    load_dotenv(dotenv_path=path.join(base_dir, ".env"))  # Force load from cwd
    load_dotenv()  # Additionally find with default search algo

    parser = argparse.ArgumentParser(description='Download all recursively reachable hedgedoc notes/pads as markdown files on the same server')
    parser.add_argument('--connectsid', '-c', metavar='csid', action=EnvDefault, envvar='PI_CONNECT_SID', required=False,
                        help='The connection sid to use. This is required to authenticate the user. If not set, the pads are requested without a session cookie. Can also be specified via the environment variable PI_CONNECT_SID')
    # No ``default=`` here on purpose: EnvDefault only consults the
    # environment when it has no truthy default, so passing "downloads"
    # directly would make PI_DOWNLOAD_DIR dead. The fallback is applied after
    # parsing instead.
    parser.add_argument('--outdir', '-o', metavar='outdir', action=EnvDefault, envvar='PI_DOWNLOAD_DIR', required=False,
                        help='The directory to store the downloaded files in (default: "' + default_outdir + '"). Can also be specified via the environment variable PI_DOWNLOAD_DIR')
    parser.add_argument('initial_note', nargs=1,
                        help='The initial note from which to search notes/pads.')
    args = parser.parse_args()

    outdir = args.outdir or default_outdir
    result = download_pads(args.initial_note[0], outdir, args.connectsid)
    for link, file_name in result.items():
        print("Downloaded note/pad {} as {}".format(link, file_name))


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
from typing import List, Dict, Optional, Tuple, Iterable
from typing import List, Dict, Optional, Iterable
from jinja2 import Template, StrictUndefined
......@@ -62,7 +62,7 @@ def initPads(files: List[str],
outdir: str,
server: str,
connectsid: Optional[str] = None,
pre_configured_links: Dict[str, str] = {}) -> List[Tuple[str, str]]:
pre_configured_links: Dict[str, str] = {}) -> Dict[str, str]:
"""
Initializes pads for the given files
The files should exists and be readable
......@@ -82,7 +82,7 @@ def initPads(files: List[str],
if connectsid is not None:
cookies['connect.sid'] = connectsid
result = {}
result: dict[str, str] = {}
all_the_links = pre_configured_links.copy()
......@@ -112,10 +112,12 @@ def initPads(files: List[str],
r: requests.Response = requests.post(url_for_this_file, headers=headers, cookies=cookies, data=pad_content.encode('utf-8'), allow_redirects=False)
if not r.ok:
print(f"Failed to create pad for {file_name}. Status: {r.status_code}. Response: {r.content.decode('utf-8')}")
continue
if not r.is_redirect:
print(f"Failed to create pad for {file_name}. Response was not a well formed redirect. Response: {r.content.decode('utf-8')}")
continue
created_page_link = r.next.url
created_page_link: str = r.next.url # type: ignore
all_the_links[file_name] = created_page_link
......
[tool.poetry]
name = "pad_initiator"
version = "0.1.0"
version = "0.2.0"
description = ""
authors = ["C-Tim <tim@c-hack.de>"]
license = "MIT"
[tool.poetry.scripts]
pad_initiator = 'pad_initiator:main'
pad_downloader = 'pad_downloader:main'
[tool.poetry.dependencies]
python = "^3.9"
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment