Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 133 additions & 9 deletions agent/site.py
Original file line number Diff line number Diff line change
Expand Up @@ -481,10 +481,40 @@ def update_plan(self, plan):
self.bench_execute(f"update-site-plan {plan}")

@step("Backup Site")
def backup(self, with_files=False):
def backup(self, with_files=False, backup_path_db=None, backup_path_conf=None, backup_path_private_files=None, backup_path_files=None):
with_files = "--with-files" if with_files else ""
self.bench_execute(f"backup {with_files} --verbose")
return self.fetch_latest_backup(with_files=with_files)
backup_path_db = f"--backup-path-db {backup_path_db}" if backup_path_db else ""
backup_path_conf = f"--backup-path-conf {backup_path_conf}" if backup_path_conf else ""
backup_path_files = f"--backup-path-files {backup_path_files}" if backup_path_files else ""
backup_path_private_files = f"--backup-path-private-files {backup_path_private_files}" if backup_path_private_files else ""

self.bench_execute(f"backup {with_files} {backup_path_conf} {backup_path_files} {backup_path_private_files} {backup_path_db} --verbose")
return {
'database': {
'path': '/home/frappe/benches/bench-40545-000001-u10-ksa/sites/test-s3-streaming-uploads.frappe.cloud/private/backups/database.sql.gz',
'file': '20260522_134200-test-s3-streaming-uploads_frappe_cloud-database.sql.gz',
'size': 10000,
'url':'https://test-s3-streaming-uploads.frappe.cloud/backups/database.sql.gz'
},
'site_config': {
'path': '/home/frappe/benches/bench-40545-000001-u10-ksa/sites/test-s3-streaming-uploads.frappe.cloud/private/backups/site_config_backup.json',
'file': 'site_config_backup.json',
'size': 328,
'url': 'https://test-s3-streaming-uploads.frappe.cloud/backups/site_config_backup.json'
},
'private': {
'path': '/home/frappe/benches/bench-40545-000001-u10-ksa/sites/test-s3-streaming-uploads.frappe.cloud/private/backups/private-files.tar',
'file': 'private-files.tar',
'size': 10240,
'url': 'https://test-s3-streaming-uploads.frappe.cloud/backups/private-files.tar'
},
'public': {
'path': '/home/frappe/benches/bench-40545-000001-u10-ksa/sites/test-s3-streaming-uploads.frappe.cloud/private/backups/files.tar',
'file': 'files.tar',
'size': 10240,
'url': 'https://test-s3-streaming-uploads.frappe.cloud/backups/files.tar'
}
}
Comment on lines +492 to +517
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P0 Hardcoded test data replaces real backup result

backup() now unconditionally returns a hard-coded dict containing a specific site name (test-s3-streaming-uploads.frappe.cloud), fixed file paths, and fixed sizes instead of calling self.fetch_latest_backup() as before. Every backup in production will return this stale, wrong data — callers that rely on real paths (e.g., upload_offsite_backup which opens backup_file["path"]) will either silently upload to the wrong location or raise a FileNotFoundError when the hard-coded path does not exist.


@step("Upload Site Backup to S3")
def upload_offsite_backup(self, backup_files, offsite, keep_files_locally_after_offsite_backup: bool):
Expand Down Expand Up @@ -830,12 +860,106 @@ def backup_job(
offsite=None,
keep_files_locally_after_offsite_backup: bool = False,
):
backup_files = self.backup(with_files)
uploaded_files = (
self.upload_offsite_backup(backup_files, offsite, keep_files_locally_after_offsite_backup)
if (offsite and backup_files)
else {}
)
if offsite:
if keep_files_locally_after_offsite_backup:
backup_files = self.backup(with_files)
uploaded_files = (
self.upload_offsite_backup(backup_files, offsite, keep_files_locally_after_offsite_backup)
if (offsite and backup_files)
else {}
)
Comment on lines +867 to +870
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P0 Mixed tabs and spaces cause TabError at import time

Lines 867–870, 894–899, and 901 use hard-tab indentation inside blocks that are indented with spaces. Python 3 forbids mixing tabs and spaces for indentation and raises TabError: inconsistent use of tabs and spaces in indentation at parse time — before any code runs. This means the entire agent/site.py module will fail to import, taking down every job that uses it. The same pattern repeats at the # get s3 config block (line 894) and # make rclone rcat start listening (line 901).

else:
# setup fifo
# todays_dt = datetime.now().strftime("%Y%m%d_%H%M%S")
# public_file = todays_dt + '-files.tar'
# db_file = todays_dt + '-database.sql.gz'
# private_file = todays_dt + '-private-files.tar'
# config_file = todays_dt + '-site_config_backup.json'

public_file = 'files.tar'
db_file = 'database.sql.gz'
private_file = 'private-files.tar'
config_file = 'site_config_backup.json'
Comment on lines +872 to +882
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Static, timestamp-less filenames risk collisions and residual data

The commented-out block above shows the original intent was to use datetime-prefixed names (e.g., 20260522_134200-database.sql.gz). The active code uses bare names (database.sql.gz, files.tar, etc.). If the backup directory already contains a prior run's files with these names, Frappe's bench backup may overwrite them mid-stream while a concurrent process is reading, or the wrong data may be uploaded. fetch_latest_backup relied on max(databases, key=os.path.getmtime) to find the newest file — that logic is now bypassed.


files_to_stream = [config_file, db_file]
if with_files:
files_to_stream += [private_file, public_file]

relative_path_to_backup_directory = f"sites/{self.name}/private/backups"
backup_directory_from_root = os.path.join(self.bench.directory, relative_path_to_backup_directory)

for file in files_to_stream:
self.bench.docker_execute(f"mkfifo {file}", subdir=relative_path_to_backup_directory)
Comment on lines +891 to +892
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 FIFOs are never cleaned up on failure

The named pipes created by mkfifo are left behind if any subsequent step raises an exception (e.g., bench execute fails, rclone threads crash). On the next backup run the mkfifo call will fail because the paths already exist, permanently breaking streaming backups for that site until someone manually removes the stale FIFOs.


# get s3 config
bucket, auth, prefix = (
offsite["bucket"],
offsite["auth"],
offsite["path"],
)

# make rclone rcat start listening
import subprocess
import threading

subprocs = []
threads = []
for file in files_to_stream:
fifo_path = os.path.join(backup_directory_from_root, file)

def start_rclone(fifo_path=fifo_path, file=file):
print(f"{file}: opening reading thread... waiting for writer")
fd = open(fifo_path, "rb")
print(f"{file}: fifo has been opened in reader_thread: writer detected")

subproc = subprocess.Popen(
[
"rclone", "rcat",
"--s3-provider", "AWS",
"--s3-access-key-id", auth["ACCESS_KEY"],
"--s3-secret-access-key", auth["SECRET_KEY"],
Comment on lines +919 to +920
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 security AWS credentials exposed in process listing

--s3-access-key-id and --s3-secret-access-key are passed as plain command-line arguments. On Linux, /proc/<pid>/cmdline is world-readable by default, and ps aux shows the full argument list to any user on the host while rclone is running. The existing upload_offsite_backup avoids this entirely by using the boto3 library with in-process credential handling. Prefer writing a temporary rclone config file (with restricted permissions) or supplying credentials via environment variables (RCLONE_S3_ACCESS_KEY_ID / RCLONE_S3_SECRET_ACCESS_KEY) passed through subprocess.Popen(env=...) rather than as CLI flags.

"--s3-region", "ap-south-1",
"--s3-endpoint", f"s3.ap-south-1.amazonaws.com",
f":s3:{bucket}/{prefix}/{file}",
],
stderr=subprocess.PIPE,
stdin=fd,
close_fds=True,
)
fd.close()
subprocs.append(subproc)
ret = subproc.wait()
err = subproc.stderr.read().decode()
print(f"rclone exit: {ret}, error: {err}")
Comment on lines +915 to +933
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 AWS region is hardcoded to ap-south-1

The rclone invocation hard-codes --s3-region ap-south-1 and the matching endpoint. upload_offsite_backup correctly reads auth.get("REGION") and uses it, but the streaming path ignores that field entirely. Any customer whose bucket is in a different region will get silent upload failures or connection errors, and the job will report success because rclone exit codes are only logged with print, not checked.

Comment on lines +929 to +933
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 rclone non-zero exit code is ignored

start_rclone calls subproc.wait(), reads ret, and prints it — but never raises if ret != 0. A failed upload (wrong credentials, bucket policy, network error) is silently treated as success. uploaded_files is then populated for every file in files_to_stream regardless of whether any upload actually succeeded, so the job completes without error while no data was transferred.


t = threading.Thread(target=start_rclone)
threads.append(t)
t.start()

import time
time.sleep(3)
Comment on lines +939 to +940
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Race condition: time.sleep(3) is not a reliable synchronization primitive

The 3-second sleep assumes all rclone reader threads will have opened their FIFO end and started listening before bench backup writes to them. Under high system load, slow process startup, or container scheduling delays this window may be insufficient, causing bench backup to block indefinitely because no reader has opened the pipe yet. The correct approach is to wait for each FIFO to have a reader (e.g., poll os.open with O_NONBLOCK on the write side, or use a threading Event per file).

print("Attempting to open files for writing in main_thread...")
# start writing to fifos
backup_files = self.backup(
with_files,
backup_path_db=os.path.join("/home/frappe/frappe-bench/", relative_path_to_backup_directory, db_file),
backup_path_conf=os.path.join("/home/frappe/frappe-bench/", relative_path_to_backup_directory, config_file),
backup_path_files=os.path.join("/home/frappe/frappe-bench/", relative_path_to_backup_directory, public_file),
backup_path_private_files=os.path.join("/home/frappe/frappe-bench/", relative_path_to_backup_directory, private_file),
)
Comment on lines +943 to +949
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 Hardcoded bench root diverges from FIFO location

The FIFOs are created under backup_directory_from_root = os.path.join(self.bench.directory, ...) (line 889), but the --backup-path-* arguments passed to bench backup are constructed from the hardcoded string /home/frappe/frappe-bench/ instead. On any bench whose directory differs from that literal (e.g., a versioned bench like /home/frappe/benches/bench-40545-000001-u10-ksa), bench backup will try to open a path that has no FIFO at the other end — the write side never connects, bench backup blocks or fails, and the rclone reader threads hang indefinitely in open(fifo_path, "rb"). The job would never complete. The fix is to replace /home/frappe/frappe-bench/ with self.bench.directory to match line 889.

print("Backup files created ✅")

# wait for all rclone threads to finish
for t in threads:
t.join()
print("rclone threads finished executing")
print("Backup uploaded ✅")

uploaded_files = {
file: f"{bucket}/{prefix}/{file}"
for file in files_to_stream
}

return {"backups": backup_files, "offsite": uploaded_files}
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P1 NameError on every non-offsite backup

The new if offsite: block only assigns backup_files and uploaded_files when offsite is truthy. When a plain backup is requested (offsite=None, which is the default), neither variable is ever initialised, so return {"backups": backup_files, "offsite": uploaded_files} raises NameError: name 'backup_files' is not defined. Every standard backup job (no offsite target) will now crash at this line without performing or recording any backup.


@job("Optimize Tables")
Expand Down
Loading