refactor tile_gen in Python

This commit is contained in:
Zsolt Ero
2024-08-27 01:47:34 +02:00
parent 41f49b0743
commit 08d17df476
17 changed files with 352 additions and 273 deletions

View File

@@ -1,102 +0,0 @@
#!/usr/bin/env bash
# Extracts tiles.mbtiles (expected in the current directory) into a btrfs
# image, rsyncs the result onto a second image, shrinks that second image and
# compresses it with pigz. Logs and stats end up in ./logs.
# abort on the first failing command
set -e
TILE_GEN_BIN=/data/ofm/tile_gen/bin
VENV_PYTHON=/data/ofm/venv/bin/python
# clean up leftovers of a previous run; the unmounts are best-effort
sudo umount mnt_rw 2> /dev/null || true
sudo umount mnt_rw2 2> /dev/null || true
rm -rf mnt_rw* tmp_*
rm -f -- *.btrfs *.gz
rm -rf -- *.log *.txt logs
# make an empty file that's definitely bigger than the current OSM output
fallocate -l 200G image.btrfs
fallocate -l 200G image2.btrfs
# metadata: single needed as default is now DUP
mkfs.btrfs \
-m single \
image.btrfs > /dev/null
mkfs.btrfs \
-m single \
image2.btrfs > /dev/null
# https://btrfs.readthedocs.io/en/latest/btrfs-man5.html#mount-options
# compression doesn't make sense, data is already gzip compressed
mkdir -p mnt_rw mnt_rw2
sudo mount \
-t btrfs \
-o noacl,nobarrier,noatime,max_inline=4096 \
image.btrfs mnt_rw
sudo mount \
-t btrfs \
-o noacl,nobarrier,noatime,max_inline=4096 \
image2.btrfs mnt_rw2
# the mounts were created via sudo; hand them over to the ofm user
sudo chown ofm:ofm -R mnt_rw mnt_rw2
# extract the mbtiles file into the first image, keeping stdout/stderr as logs
$VENV_PYTHON $TILE_GEN_BIN/extract_mbtiles/extract_mbtiles.py \
tiles.mbtiles mnt_rw/extract \
> extract_out.log 2> extract_err.log
# keep a copy of osm_date in the run directory
cp mnt_rw/extract/osm_date .
# grep exits non-zero when nothing matches; `|| true` keeps `set -e` from aborting
grep fixed extract_out.log > dedupl_fixed.log || true
# Unfortunately, by deleting files from the btrfs partition, the size _grows_.
# So we need to rsync onto a new partition.
rsync -avH \
--max-alloc=4294967296 \
--exclude dedupl \
mnt_rw/extract/ mnt_rw2/ \
> rsync_out.log 2> rsync_err.log
# collect stats
# stats1.txt describes the first image (mnt_rw)
{
echo -e "df -h"
sudo df -h mnt_rw
echo -e "\n\nbtrfs filesystem df"
sudo btrfs filesystem df mnt_rw
echo -e "\n\nbtrfs filesystem show"
sudo btrfs filesystem show mnt_rw
echo -e "\n\nbtrfs filesystem usage"
sudo btrfs filesystem usage mnt_rw
} > stats1.txt
# stats2.txt describes the second image (mnt_rw2)
{
echo -e "df -h"
sudo df -h mnt_rw2
echo -e "\n\nbtrfs filesystem df"
sudo btrfs filesystem df mnt_rw2
echo -e "\n\nbtrfs filesystem show"
sudo btrfs filesystem show mnt_rw2
echo -e "\n\nbtrfs filesystem usage"
sudo btrfs filesystem usage mnt_rw2
} > stats2.txt
# unmount both images and remove the now-empty mount points
sudo umount mnt_rw
sudo umount mnt_rw2
rm -r mnt_rw*
# shrink the second image; the first one is discarded afterwards
sudo $VENV_PYTHON $TILE_GEN_BIN/shrink_btrfs/shrink_btrfs.py image2.btrfs \
> shrink_out.log 2> shrink_err.log
rm image.btrfs
mv image2.btrfs tiles.btrfs
# parallel gzip of the final image
pigz tiles.btrfs --fast
# gather all logs and stats in one directory
mkdir -p logs
mv -- *.log logs
mv -- *.txt logs
echo extract_btrfs.sh DONE

View File

@@ -1,31 +0,0 @@
#!/usr/bin/env bash
# Runs planetiler for the monaco area inside a new timestamped run folder,
# then starts extract_btrfs.sh from that folder.
# abort on the first failing command
set -e
TILE_GEN_BIN=/data/ofm/tile_gen/bin
AREA=monaco
# timestamp (local time) that names the run folder
DATE=$(date +"%Y%m%d_%H%M%S")
RUN_FOLDER="/data/ofm/tile_gen/runs/$AREA/${DATE}_pt"
mkdir -p "$RUN_FOLDER"
cd "$RUN_FOLDER" || exit
# The `# ...` pieces in backticks are command substitutions that expand to
# nothing; they embed comments inside the backslash-continued command line.
java -Xmx1g \
-jar $TILE_GEN_BIN/planetiler.jar \
`# Download the latest osm.pbf from s3://osm-pds bucket` \
--area=$AREA --download \
`# Accelerate the download by fetching the 10 1GB chunks at a time in parallel` \
--download-threads=10 --download-chunk-size-mb=1000 \
`# Also download name translations from wikidata` \
--fetch-wikidata \
--output=tiles.mbtiles \
`# Store temporary node locations at fixed positions in a memory-mapped file` \
--nodemap-type=array --storage=mmap \
--force \
> planetiler.out 2> planetiler.err
# remove the ./data directory left in the run folder (presumably planetiler's downloads - verify)
rm -r data
echo planetiler.jar DONE
# continue with the btrfs extraction; it runs with the run folder as its working directory
$TILE_GEN_BIN/extract_btrfs.sh

View File

@@ -1,33 +0,0 @@
#!/usr/bin/env bash
# Runs planetiler for the whole planet inside a new timestamped run folder,
# then starts extract_btrfs.sh from that folder.
# abort on the first failing command
set -e
TILE_GEN_BIN=/data/ofm/tile_gen/bin
AREA=planet
# timestamp (local time) that names the run folder
DATE=$(date +"%Y%m%d_%H%M%S")
RUN_FOLDER="/data/ofm/tile_gen/runs/$AREA/${DATE}_pt"
mkdir -p "$RUN_FOLDER"
cd "$RUN_FOLDER" || exit
# the Xmx value below is the most important parameter here
# 30 GB works well
# The `# ...` pieces in backticks are command substitutions that expand to
# nothing; they embed comments inside the backslash-continued command line.
java -Xmx30g \
-jar $TILE_GEN_BIN/planetiler.jar \
`# Download the latest planet.osm.pbf from s3://osm-pds bucket` \
--area=planet --bounds=planet --download \
`# Accelerate the download by fetching the 10 1GB chunks at a time in parallel` \
--download-threads=10 --download-chunk-size-mb=1000 \
`# Also download name translations from wikidata` \
--fetch-wikidata \
--output=tiles.mbtiles \
`# Store temporary node locations at fixed positions in a memory-mapped file` \
--nodemap-type=array --storage=mmap \
--force \
> planetiler.out 2> planetiler.err
# remove the ./data directory left in the run folder (presumably planetiler's downloads - verify)
rm -r data
echo planetiler.jar DONE
# continue with the btrfs extraction; it runs with the run folder as its working directory
$TILE_GEN_BIN/extract_btrfs.sh

View File

@@ -0,0 +1,2 @@
These are self-contained Python scripts; they can be run outside of this project's environment.

79
scripts/tile_gen/tile_gen.py Executable file
View File

@@ -0,0 +1,79 @@
#!/usr/bin/env python3
import json
import subprocess
from pathlib import Path
import click
from tile_gen_lib.config import config
from tile_gen_lib.extract import make_btrfs
from tile_gen_lib.planetiler import run_planetiler
from tile_gen_lib.upload import make_indexes, upload_rclone
# Root click command group; the commands defined below register on it via
# @cli.command(). The docstring doubles as the CLI help text.
@click.group()
def cli():
    """
    Generates tiles and uploads to CloudFlare
    """
@cli.command()
@click.argument('area', required=True)
def make_tiles(area):
    """
    Generate tiles for a given area
    """

    # run_planetiler() returns the run folder it created for `area`;
    # make_btrfs() then builds the btrfs image inside that same folder,
    # instead of operating on a hard-coded run from a debugging session
    run_folder = run_planetiler(area)
    make_btrfs(run_folder)
@cli.command()
def upload_runs():
    """
    Upload all runs present in system
    """

    print('running upload_runs')

    for area in config.areas:
        # areas without a local runs directory have nothing to upload
        if not (config.runs_dir / area).exists():
            continue

        # list the directories already present in the area's remote bucket
        p = subprocess.run(
            [
                'rclone',
                'lsjson',
                '--dirs-only',
                '--fast-list',
                f'remote:ofm-{area}',
            ],
            text=True,
            capture_output=True,
            check=True,
            # note: `env` replaces the child's whole environment with this one variable
            env=dict(RCLONE_CONFIG='/data/ofm/config/rclone.conf'),
        )
        rclone_json = json.loads(p.stdout)
        runs_remote = {p['Path'] for p in rclone_json}
        runs_local = {p.name for p in (config.runs_dir / area).iterdir()}

        # upload only the runs that exist locally but not on the remote
        runs_to_upload = runs_local - runs_remote
        for run in runs_to_upload:
            print(f'uploading {area} {run}')
            upload_rclone(area, run)

    # NOTE(review): nesting reconstructed - assumed to run once after all areas; confirm
    make_indexes()
@cli.command()
def index():
    """
    Run index on Cloudflare buckets
    """

    # thin CLI wrapper: all the work happens in make_indexes()
    make_indexes()
# invoke the click command group only when executed as a script
if __name__ == '__main__':
    cli()

View File

@@ -0,0 +1,18 @@
from pathlib import Path
class Configuration:
    """Fixed filesystem layout and area list used by the tile generation code."""

    # everything lives below this root
    tile_gen_dir = Path('/data/ofm/tile_gen')

    # bin directory and the scripts directory inside it
    tile_gen_bin = tile_gen_dir.joinpath('bin')
    tile_gen_scripts_dir = tile_gen_bin.joinpath('scripts')

    # planetiler directory and the jar inside it
    planetiler_bin = tile_gen_dir.joinpath('planetiler')
    planetiler_path = planetiler_bin.joinpath('planetiler.jar')

    # one sub-directory per area is looked up below this directory
    runs_dir = tile_gen_dir.joinpath('runs')

    # area names accepted by the tooling
    areas = ['planet', 'monaco']


# shared module-level instance imported by the other modules
config = Configuration()

View File

@@ -0,0 +1,136 @@
import os
import shutil
import subprocess
import sys
from pathlib import Path
from tile_gen_lib.config import config
from tile_gen_lib.utils import python_venv_executable
IMAGE_SIZE = '200G'


def _run_logged(command: list, log_prefix: str) -> None:
    """
    Run `command` with stdout and stderr redirected to
    `<log_prefix>_out.log` and `<log_prefix>_err.log` in the current directory.

    Raises subprocess.CalledProcessError if the command exits non-zero.
    """
    with open(f'{log_prefix}_out.log', 'w') as out, open(f'{log_prefix}_err.log', 'w') as err:
        subprocess.run(command, check=True, stdout=out, stderr=err)


def make_btrfs(run_folder: Path) -> Path:
    """
    Build a compressed btrfs image from `tiles.mbtiles` inside `run_folder`.

    The function changes the working directory to `run_folder` and then:
      1. removes leftovers of a previous attempt (mounts, images, logs),
      2. creates and mounts two btrfs images (`mnt_rw`, `mnt_rw2`),
      3. runs extract_mbtiles.py into the first image,
      4. rsyncs the extract (without `dedupl`) onto the second image,
      5. writes filesystem stats, unmounts, shrinks the second image,
      6. renames it to `tiles.btrfs`, compresses it with pigz and
         moves all `*.log` / `*.txt` files into `logs/`.

    Several steps run through `sudo` (mount, umount, chown, btrfs, shrink).

    Returns `run_folder` unchanged.

    Raises subprocess.CalledProcessError when any checked command fails.
    """
    os.chdir(run_folder)

    # cleanup
    # unmounting is best-effort: the mounts usually do not exist
    for mount in ['mnt_rw', 'mnt_rw2']:
        subprocess.run(['sudo', 'umount', mount], capture_output=True)

    for pattern in ['mnt_rw*', 'tmp_*', '*.btrfs', '*.gz', '*.log', '*.txt', 'logs', 'osm_date']:
        for item in Path().glob(pattern):
            if item.is_dir():
                shutil.rmtree(item)
            else:
                item.unlink()

    # make an empty file that's definitely bigger than the current OSM output
    for image in ['image.btrfs', 'image2.btrfs']:
        subprocess.run(['fallocate', '-l', IMAGE_SIZE, image], check=True)
        subprocess.run(['mkfs.btrfs', '-m', 'single', image], check=True, capture_output=True)

    for image, mount in [('image.btrfs', 'mnt_rw'), ('image2.btrfs', 'mnt_rw2')]:
        Path(mount).mkdir()

        # https://btrfs.readthedocs.io/en/latest/btrfs-man5.html#mount-options
        # compression doesn't make sense, data is already gzip compressed
        subprocess.run(
            [
                'sudo',
                'mount',
                '-t',
                'btrfs',
                '-o',
                'noacl,nobarrier,noatime,max_inline=4096',
                image,
                mount,
            ],
            check=True,
        )
        # the mount was created via sudo; hand it over to the ofm user
        subprocess.run(['sudo', 'chown', 'ofm:ofm', '-R', mount], check=True)

    # extract mbtiles
    extract_script = config.tile_gen_scripts_dir / 'extract_mbtiles.py'
    _run_logged(
        [
            python_venv_executable(),
            extract_script,
            'tiles.mbtiles',
            'mnt_rw/extract',
        ],
        'extract',
    )

    shutil.copy('mnt_rw/extract/osm_date', '.')

    # process logs
    # keep only the lines containing "fixed"; filtered in-process (binary-safe)
    # instead of spawning a shell for `grep ... > file`
    with open('extract_out.log', 'rb') as src, open('dedupl_fixed.log', 'wb') as dst:
        dst.writelines(line for line in src if b'fixed' in line)

    # unfortunately, by deleting files from the btrfs partition, the partition size grows
    # so we need to rsync onto a new partition instead of deleting
    _run_logged(
        [
            'rsync',
            '-avH',
            '--max-alloc=4294967296',
            '--exclude',
            'dedupl',
            'mnt_rw/extract/',
            'mnt_rw2/',
        ],
        'rsync',
    )

    # collect stats
    for i, mount in enumerate(['mnt_rw', 'mnt_rw2'], 1):
        with open(f'stats{i}.txt', 'w') as f:
            for cmd in [
                ['df', '-h', mount],
                ['btrfs', 'filesystem', 'df', mount],
                ['btrfs', 'filesystem', 'show', mount],
                ['btrfs', 'filesystem', 'usage', mount],
            ]:
                f.write(f"\n\n{' '.join(cmd)}\n")
                result = subprocess.run(['sudo'] + cmd, check=True, capture_output=True, text=True)
                f.write(result.stdout)

    # unmount and cleanup
    for mount in ['mnt_rw', 'mnt_rw2']:
        subprocess.run(['sudo', 'umount', mount], check=True)
    shutil.rmtree('mnt_rw')
    shutil.rmtree('mnt_rw2')

    # shrink btrfs
    shrink_script = config.tile_gen_scripts_dir / 'shrink_btrfs.py'
    _run_logged(
        ['sudo', python_venv_executable(), shrink_script, 'image2.btrfs'],
        'shrink',
    )

    # only the shrunk second image is kept
    os.unlink('image.btrfs')
    shutil.move('image2.btrfs', 'tiles.btrfs')

    # parallel gzip
    subprocess.run(['pigz', 'tiles.btrfs', '--fast'], check=True)

    # logs
    Path('logs').mkdir()
    for pattern in ['*.log', '*.txt']:
        for file in Path().glob(pattern):
            shutil.move(file, 'logs')

    print('extract_btrfs.py DONE')
    return run_folder

View File

@@ -0,0 +1,53 @@
import os
import shutil
import subprocess
from datetime import datetime, timezone
from pathlib import Path
from tile_gen_lib.config import config
def run_planetiler(area: str) -> Path:
    """
    Run planetiler for `area` in a fresh run folder and return that folder.

    All previous runs of the area are deleted first. The new folder is
    `<runs_dir>/<area>/<UTC timestamp>_pt`; planetiler writes `tiles.mbtiles`
    there, with its stdout/stderr captured in `planetiler.out` /
    `planetiler.err`. The process working directory is changed to the run
    folder, and the `data` directory inside it is removed once planetiler
    has finished.

    Raises ValueError for an area not listed in `config.areas` and
    subprocess.CalledProcessError when planetiler exits non-zero.
    """
    # explicit check instead of `assert`: asserts are stripped under -O and
    # an unchecked value would flow straight into the rmtree below
    if area not in config.areas:
        raise ValueError(f'unknown area: {area!r}')

    date = datetime.now(tz=timezone.utc).strftime('%Y%m%d_%H%M%S')

    area_dir = config.runs_dir / area

    # delete all previous runs for the given area
    shutil.rmtree(area_dir, ignore_errors=True)

    run_folder = area_dir / f'{date}_pt'
    run_folder.mkdir(parents=True, exist_ok=True)

    os.chdir(run_folder)

    # the heap size is the main tuning knob; planet needs far more than small areas
    java_memory_gb = 30 if area == 'planet' else 1

    command = [
        'java',
        f'-Xmx{java_memory_gb}g',
        '-jar',
        config.planetiler_path,
        f'--area={area}',
        '--download',
        '--download-threads=10',
        '--download-chunk-size-mb=1000',
        '--fetch-wikidata',
        '--output=tiles.mbtiles',
        '--nodemap-type=array',
        '--storage=mmap',
        '--force',
    ]

    if area == 'planet':
        # append, not `+=`: extending a list with a str adds one element per character
        command.append('--bounds=planet')

    out_path = run_folder / 'planetiler.out'
    err_path = run_folder / 'planetiler.err'

    with out_path.open('w') as out_file, err_path.open('w') as err_file:
        subprocess.run(command, stdout=out_file, stderr=err_file, check=True, cwd=run_folder)

    shutil.rmtree(run_folder / 'data', ignore_errors=True)

    print('planetiler.jar DONE')
    return run_folder

View File

@@ -1,15 +1,6 @@
#!/usr/bin/env python3
import json
import pathlib
import shutil
import subprocess
import click
AREAS = ['planet', 'monaco']
RUNS_DIR = pathlib.Path('/data/ofm/tile_gen/runs')
from tile_gen_lib.config import config
def upload_rclone(area, run):
@@ -25,10 +16,10 @@ def upload_rclone(area, run):
'0',
'--stats-one-line',
'--log-file',
RUNS_DIR / area / run / 'logs' / 'rclone.log',
config.runs_dir / area / run / 'logs' / 'rclone.log',
'--exclude',
'logs/**',
RUNS_DIR / area / run,
config.runs_dir / area / run,
f'remote:ofm-{area}/{run}',
],
env=dict(RCLONE_CONFIG='/data/ofm/config/rclone.conf'),
@@ -37,7 +28,7 @@ def upload_rclone(area, run):
def make_indexes():
for area in AREAS:
for area in config.areas:
print(f'creating index {area}')
# files
@@ -100,60 +91,3 @@ def make_indexes():
check=True,
input=index_str.encode(),
)
# Root click command group; the commands defined below register on it via
# @cli.command(). The docstring doubles as the CLI help text.
@click.group()
def cli():
    """
    Uploads runs to Cloudflare
    """
@cli.command()
def upload_runs():
    """
    Upload all runs present in system
    """

    print('running upload_runs')

    for area in AREAS:
        # areas without a local runs directory have nothing to upload
        if not (RUNS_DIR / area).exists():
            continue

        # list the directories already present in the area's remote bucket
        p = subprocess.run(
            [
                'rclone',
                'lsjson',
                '--dirs-only',
                '--fast-list',
                f'remote:ofm-{area}',
            ],
            text=True,
            capture_output=True,
            check=True,
            # note: `env` replaces the child's whole environment with this one variable
            env=dict(RCLONE_CONFIG='/data/ofm/config/rclone.conf'),
        )
        rclone_json = json.loads(p.stdout)
        runs_remote = {p['Path'] for p in rclone_json}
        runs_local = {p.name for p in (RUNS_DIR / area).iterdir()}

        # upload only the runs that exist locally but not on the remote
        runs_to_upload = runs_local - runs_remote
        for run in runs_to_upload:
            print(f'uploading {area} {run}')
            upload_rclone(area, run)

    # NOTE(review): nesting reconstructed - assumed to run once after all areas; confirm
    make_indexes()
@cli.command()
def index():
    """
    Run index on Cloudflare buckets
    """

    # thin CLI wrapper: all the work happens in make_indexes()
    make_indexes()
# invoke the click command group only when executed as a script
if __name__ == '__main__':
    cli()

View File

@@ -0,0 +1,14 @@
import os
import sys
from pathlib import Path
def python_venv_executable() -> Path:
    """
    Return the path of the Python interpreter to use for helper scripts.

    Preference order:
      1. `<VIRTUAL_ENV>/bin/python` when the VIRTUAL_ENV variable is set and non-empty,
      2. `<sys.prefix>/bin/python` when sys.prefix differs from sys.base_prefix,
      3. `sys.executable` otherwise.
    """
    venv_root = os.environ.get('VIRTUAL_ENV')

    # no (or empty) VIRTUAL_ENV: fall back to the interpreter's own prefix,
    # but only if it differs from the base installation
    if not venv_root and sys.prefix != sys.base_prefix:
        venv_root = sys.prefix

    if not venv_root:
        return Path(sys.executable)

    return Path(venv_root) / 'bin' / 'python'