-
Notifications
You must be signed in to change notification settings - Fork 109
fix(ota-hil): set recovery mode before capture logs #903
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 16 commits
d2fea56
195d75e
d02e2b5
9040984
9ba8e91
8866016
b7f0532
a950a82
f85882a
912df00
68cb5cd
1543d27
b75b812
8dda1fb
3725ede
7b12495
2466287
7940035
f92ac94
9a55d7c
7973a70
a8d615b
9fe311c
8a42a30
368513b
b50d887
e27f1cc
19fb770
7e24ebf
7a7c7c5
82f5f84
c17337e
3687bf4
33497fc
0522779
484fa1b
d150f51
0ef98f2
8781765
8b471ea
1b07e90
e83bfa5
1f65059
449a924
0c15f91
ceae6e6
4f8b4b2
5cc74ab
83a80fd
c8df636
32b2f1f
16746e9
0299eda
d972cb2
b96d50f
6c1ae63
8be233d
d7b211a
fe8d64b
a16deaa
2651eef
c377b95
8e02331
55f290b
cb86863
62f59a0
3a9c42e
3cb60a9
22e027c
7a1b966
fe82002
1df5421
6f2a845
73b249a
022ae7b
d2ff64c
269826c
3568b0b
857ec45
7662565
b113e94
bec8b5d
b2b4038
2c89aa4
65dacd1
0e6ac47
3b35464
aef3ffe
2a7ff45
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -37,7 +37,12 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { | |
|
|
||
| info!("Turning off"); | ||
| let device_clone = device.cloned(); | ||
| let ftdi = tokio::task::spawn_blocking(|| -> Result<_, color_eyre::Report> { | ||
| let recovery_state = if recovery { | ||
| OutputState::Low | ||
| } else { | ||
| OutputState::High | ||
| }; | ||
| let ftdi = tokio::task::spawn_blocking(move || -> Result<_, color_eyre::Report> { | ||
| for d in FtdiGpio::list_devices().wrap_err("failed to list ftdi devices")? { | ||
| debug!( | ||
| "ftdi device: desc:{}, serial:{}, vid:{}, pid:{}", | ||
|
|
@@ -46,7 +51,7 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { | |
| } | ||
| let mut ftdi = make_ftdi(device_clone)?; | ||
| ftdi.set_pin(BUTTON_PIN, OutputState::Low)?; | ||
| ftdi.set_pin(RECOVERY_PIN, OutputState::High)?; | ||
| ftdi.set_pin(RECOVERY_PIN, recovery_state)?; | ||
| Ok(ftdi) | ||
| }) | ||
| .await | ||
|
|
@@ -74,7 +79,7 @@ pub async fn reboot(recovery: bool, device: Option<&FtdiId>) -> Result<()> { | |
| }) | ||
| .await | ||
| .wrap_err("task panicked")??; | ||
| tokio::time::sleep(Duration::from_secs(4)).await; | ||
| tokio::time::sleep(Duration::from_secs(10)).await; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You are not using the const here
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. this is only blocknig comment - everything else is great job! |
||
|
|
||
| tokio::task::spawn_blocking(move || ftdi.destroy()) | ||
| .await | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -69,6 +69,11 @@ pub struct Ota { | |
| /// Serial port ID for boot log capture (alternative to --serial-path) | ||
| #[arg(long, group = "serial")] | ||
| serial_id: Option<String>, | ||
|
|
||
| /// Skip NTP time synchronization check before the first reboot (after wipe_overlays). | ||
| /// Time sync will still be checked after reboot and before starting the update. | ||
| #[arg(long, default_value = "false")] | ||
| skip_time_sync_before_reboot: bool, | ||
| } | ||
|
|
||
| #[derive(Debug, Clone, clap::ValueEnum)] | ||
|
|
@@ -94,6 +99,14 @@ impl Ota { | |
| let _start_time = Instant::now(); | ||
| info!("Starting OTA update to version: {}", self.target_version); | ||
|
|
||
| // Create log directory if it doesn't exist | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think we can remove comments like this one, they don't bring much value |
||
| if let Some(log_dir) = self.log_file.parent() { | ||
| tokio::fs::create_dir_all(log_dir).await.wrap_err_with(|| { | ||
| format!("Failed to create log directory: {}", log_dir.display()) | ||
| })?; | ||
| info!("Log directory created/verified: {}", log_dir.display()); | ||
| } | ||
|
|
||
| let session = self.connect_ssh().await.inspect_err(|e| { | ||
| println!("OTA_RESULT=FAILED"); | ||
| println!("OTA_ERROR=SSH_CONNECTION_FAILED: {e}"); | ||
|
|
@@ -105,7 +118,19 @@ impl Ota { | |
| system::wipe_overlays(&session).await.inspect_err(|e| { | ||
| error!("Failed to wipe overlays: {}", e); | ||
| })?; | ||
| info!("Overlays wiped successfully, rebooting device"); | ||
| info!("Overlays wiped successfully"); | ||
|
|
||
| if !self.skip_time_sync_before_reboot { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. why we need this I wonder ?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. So, basically the problem was with the worldcoin-key-retrival service. With pearl, teh harware clock is not working so if it NTP server is not synced then worldcoin-attest is failing and also key-retrieval. This should be fixed with new key-retrieval fix, that is why i ketp it as a flag, so we can test older commits, that don't have key-retrival fixed
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I am also not getting it sorry.
That is not true. The clock will synchronize and https requests is going to succeed. And the delay is not big for that
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. By default this is not correct. Because reboot is manually triggered faster than the NTP server syncrhonizes the clock.... |
||
| info!("Waiting for NTP time synchronization before reboot"); | ||
| system::wait_for_time_sync(&session) | ||
| .await | ||
| .inspect_err(|e| { | ||
| error!("Failed to sync time before reboot: {}", e); | ||
| })?; | ||
| info!("NTP time synchronized, rebooting device"); | ||
| } else { | ||
| info!("Skipping NTP time synchronization before reboot (--skip-time-sync-before-reboot flag set)"); | ||
| } | ||
|
|
||
| system::reboot_orb(&session).await?; | ||
| info!("Reboot command sent to Orb device"); | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,30 +20,51 @@ impl Ota { | |
| pub(super) async fn handle_reboot(&self, log_suffix: &str) -> Result<SshWrapper> { | ||
| info!("Waiting for reboot and device to come back online"); | ||
|
|
||
| // Set recovery pin HIGH for 5 seconds to prevent entering recovery mode | ||
| info!("Setting recovery pin HIGH to prevent recovery mode during reboot"); | ||
| // Always wait for SSH to become unreachable before holding the recovery pin. | ||
| info!("Monitoring SSH connection to detect when shutdown actually begins"); | ||
| self.wait_for_ssh_disconnection(Duration::from_secs(30)) | ||
| .await?; | ||
| info!("SSH disconnected - system is shutting down, holding recovery pin"); | ||
|
|
||
| let hold_duration = 20; | ||
|
|
||
| info!( | ||
| "Setting recovery pin HIGH to prevent recovery mode during reboot (hold duration: {}s)", | ||
| hold_duration | ||
| ); | ||
| let set_recovery = SetRecoveryPin { | ||
| state: OutputState::High, | ||
| serial_num: None, | ||
| desc: None, | ||
| duration: 5, | ||
| duration: hold_duration, | ||
| }; | ||
|
|
||
| // Run recovery pin setting in background task | ||
| let recovery_task = tokio::spawn(async move { | ||
| set_recovery | ||
| .run() | ||
| .await | ||
| .wrap_err("failed to set recovery pin") | ||
| }); | ||
|
|
||
| self.capture_boot_logs(log_suffix).await?; | ||
|
|
||
| // Wait for recovery pin task to complete | ||
| recovery_task | ||
| .await | ||
| .wrap_err("recovery pin task panicked")??; | ||
|
|
||
| // Brief delay to allow USB device to be re-enumerated and udev rules to apply | ||
| // after FTDI GPIO is released. The FTDI device detaches/reattaches kernel | ||
| // drivers which causes /dev/ttyUSB* to be recreated. | ||
| tokio::time::sleep(Duration::from_millis(200)).await; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. timings like this might be weak spot and break often - but if works now i am fine
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have tested and look reliable enough. Even though if we don't ahve logs in the future we will now the root of failure and fix it.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Better then make it a constant |
||
|
|
||
| // Spawn boot log capture as a background task so it runs concurrently | ||
| // with SSH reconnection attempts. Extract needed values upfront. | ||
| let platform = self.platform.clone(); | ||
| let log_file = self.log_file.clone(); | ||
| let serial_path = self.get_serial_path().ok(); | ||
| let boot_log_suffix = log_suffix.to_string(); | ||
| let boot_log_task = tokio::spawn(async move { | ||
| Self::capture_boot_logs_static(platform, log_file, serial_path, &boot_log_suffix).await | ||
| }); | ||
|
|
||
| let start_time = Instant::now(); | ||
| let timeout = Duration::from_secs(900); // 15 minutes | ||
| let mut attempt_count = 0; | ||
|
|
@@ -63,7 +84,35 @@ impl Ota { | |
| Ok(session) => match session.test_connection().await { | ||
| Ok(_) => { | ||
| info!("Device is back online and responsive after reboot (attempt {})", attempt_count); | ||
| return Ok(session); | ||
|
|
||
| info!("Waiting for NTP time synchronization after reboot"); | ||
| match super::system::wait_for_time_sync(&session).await { | ||
| Ok(_) => { | ||
| info!("NTP time synchronized successfully"); | ||
|
|
||
| // Wait for boot log capture to finish | ||
| match boot_log_task.await { | ||
| Ok(Ok(())) => { | ||
| info!("Boot log capture completed successfully"); | ||
| } | ||
| Ok(Err(e)) => { | ||
| warn!("Boot log capture failed: {}", e); | ||
| } | ||
| Err(e) => { | ||
| warn!("Boot log capture task panicked: {}", e); | ||
| } | ||
| } | ||
|
|
||
| return Ok(session); | ||
| } | ||
| Err(e) => { | ||
| debug!( | ||
| "Time sync failed on attempt {}: {}", | ||
| attempt_count, e | ||
| ); | ||
| last_error = Some(e); | ||
| } | ||
| } | ||
| } | ||
| Err(e) => { | ||
| debug!( | ||
|
|
@@ -103,27 +152,40 @@ impl Ota { | |
| ); | ||
| } | ||
|
|
||
| /// Captures boot logs from serial port in the background | ||
| #[instrument(skip_all)] | ||
| async fn capture_boot_logs(&self, log_suffix: &str) -> Result<()> { | ||
| let platform_name = format!("{:?}", self.platform).to_lowercase(); | ||
| async fn capture_boot_logs_static( | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. i think it's bad practice to have static in the end of the name
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I will fix! |
||
| platform: super::Platform, | ||
| log_file: std::path::PathBuf, | ||
| serial_path: Option<std::path::PathBuf>, | ||
| log_suffix: &str, | ||
| ) -> Result<()> { | ||
| let platform_name = format!("{:?}", platform).to_lowercase(); | ||
| info!( | ||
| "Starting boot log capture for {} ({})", | ||
| log_suffix, platform_name | ||
| ); | ||
|
|
||
| let boot_log_path = self | ||
| .log_file | ||
| let boot_log_path = log_file | ||
| .parent() | ||
| .unwrap_or_else(|| std::path::Path::new(".")) | ||
| .join(format!("boot_log_{platform_name}_{log_suffix}.txt")); | ||
|
|
||
| let serial_path = match self.get_serial_path() { | ||
| Ok(path) => path, | ||
| Err(e) => { | ||
| warn!( | ||
| "Failed to get serial path: {}. Skipping boot log capture.", | ||
| e | ||
| ); | ||
| // Create parent directory if it doesn't exist | ||
| if let Some(parent) = boot_log_path.parent() | ||
| && let Err(e) = tokio::fs::create_dir_all(parent).await | ||
| { | ||
| warn!( | ||
| "Failed to create directory {}: {}. Boot log capture may fail.", | ||
| parent.display(), | ||
| e | ||
| ); | ||
| } | ||
|
|
||
| let serial_path = match serial_path { | ||
| Some(path) => path, | ||
| None => { | ||
| warn!("No serial path provided. Skipping boot log capture."); | ||
| return Ok(()); | ||
| } | ||
| }; | ||
|
|
@@ -151,21 +213,52 @@ impl Ota { | |
| spawn_serial_reader_task(serial_reader, serial_output_tx); | ||
|
|
||
| let boot_log_fut = async { | ||
| let mut boot_log_content = Vec::new(); | ||
| use tokio::io::AsyncWriteExt; | ||
|
|
||
| // Open file for writing incrementally | ||
| let mut log_file = match tokio::fs::OpenOptions::new() | ||
| .create(true) | ||
| .write(true) | ||
| .truncate(true) | ||
| .open(&boot_log_path) | ||
| .await | ||
| { | ||
| Ok(f) => Some(f), | ||
| Err(e) => { | ||
| warn!( | ||
| "Failed to open boot log file {}: {}. Will continue without writing to disk.", | ||
| boot_log_path.display(), | ||
| e | ||
| ); | ||
| None | ||
| } | ||
| }; | ||
|
|
||
| let mut total_bytes = 0; | ||
| let mut serial_stream = BroadcastStream::new(serial_output_rx); | ||
| // 3-minute timeout for flaky serial connections | ||
| let timeout = Duration::from_secs(180); | ||
|
|
||
| let start_time = Instant::now(); | ||
| let mut found_login_prompt = false; | ||
|
|
||
| while start_time.elapsed() < timeout { | ||
| // Wait indefinitely until login prompt is detected | ||
| loop { | ||
| match tokio::time::timeout(Duration::from_secs(1), serial_stream.next()) | ||
| .await | ||
| { | ||
| Ok(Some(Ok(bytes))) => { | ||
| boot_log_content.extend_from_slice(&bytes); | ||
| // Write to file immediately as data arrives | ||
| if let Some(ref mut file) = log_file { | ||
| if let Err(e) = file.write_all(&bytes).await { | ||
| warn!("Failed to write to boot log file: {}. Continuing capture in memory only.", e); | ||
| log_file = None; | ||
| } else { | ||
| // Flush to ensure data is written to disk immediately | ||
| let _ = file.flush().await; | ||
| total_bytes += bytes.len(); | ||
| } | ||
| } | ||
|
|
||
| // Stop capturing when login prompt is detected | ||
| if let Ok(text) = String::from_utf8(bytes.to_vec()) | ||
| && text.contains(LOGIN_PROMPT_PATTERN) | ||
| { | ||
|
|
@@ -185,36 +278,34 @@ impl Ota { | |
| break; | ||
| } | ||
| Err(_) => { | ||
| // Timeout on reading - continue waiting | ||
| continue; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| if start_time.elapsed() >= timeout && !found_login_prompt { | ||
| if found_login_prompt { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. where do we set timeout for this?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The timeout is the basically if the login prompth is fouund or not. If login prompt is never found, then the timeout of ssh will fail later so the process will exit with error |
||
| info!( | ||
| "Boot log capture completed successfully after {:?}", | ||
| start_time.elapsed() | ||
| ); | ||
| } else { | ||
| warn!( | ||
| "Boot log capture timed out after {:?} without finding login prompt. Will proceed with SSH reconnection anyway.", | ||
| timeout | ||
| "Boot log capture ended without detecting login prompt after {:?}", | ||
| start_time.elapsed() | ||
| ); | ||
| } | ||
|
|
||
| if !boot_log_content.is_empty() { | ||
| match tokio::fs::write(&boot_log_path, &boot_log_content).await { | ||
| Ok(_) => { | ||
| info!( | ||
| "Boot log saved to: {} ({} bytes)", | ||
| boot_log_path.display(), | ||
| boot_log_content.len() | ||
| ); | ||
| } | ||
| Err(e) => { | ||
| warn!( | ||
| "Failed to write boot log to {}: {}. Continuing anyway.", | ||
| boot_log_path.display(), | ||
| e | ||
| ); | ||
| } | ||
| } | ||
| } else { | ||
| // Final flush and close | ||
| if let Some(mut file) = log_file { | ||
| let _ = file.flush().await; | ||
| let _ = file.shutdown().await; | ||
| info!( | ||
| "Boot log saved to: {} ({} bytes)", | ||
| boot_log_path.display(), | ||
| total_bytes | ||
| ); | ||
| } else if total_bytes == 0 { | ||
| warn!("No boot log content captured from serial"); | ||
| } | ||
|
|
||
|
|
@@ -235,4 +326,46 @@ impl Ota { | |
|
|
||
| Ok(()) | ||
| } | ||
|
|
||
| /// Wait for SSH connection to become unreachable, indicating shutdown has started | ||
| #[instrument(skip_all)] | ||
| async fn wait_for_ssh_disconnection(&self, timeout: Duration) -> Result<()> { | ||
| let start = Instant::now(); | ||
| let mut attempt = 0; | ||
|
|
||
| loop { | ||
| if start.elapsed() > timeout { | ||
| bail!("SSH did not disconnect within {:?}", timeout); | ||
| } | ||
|
|
||
| attempt += 1; | ||
|
|
||
| // Try to establish connection with a lightweight command | ||
| match self.connect_ssh().await { | ||
| Ok(session) => match session.execute_command("echo").await { | ||
| Ok(_) => { | ||
| // SSH still alive, system hasn't started shutting down yet | ||
| debug!( | ||
| "SSH still responsive (attempt {}), waiting for shutdown...", | ||
| attempt | ||
| ); | ||
| tokio::time::sleep(Duration::from_millis(500)).await; | ||
| } | ||
| Err(_) => { | ||
| // Command failed but connection succeeded - might be shutting down | ||
| info!("SSH connection degraded, shutdown likely in progress"); | ||
| return Ok(()); | ||
| } | ||
| }, | ||
| Err(_) => { | ||
| // Can't connect - shutdown has started | ||
| info!( | ||
| "SSH connection lost after {} attempts, shutdown confirmed", | ||
| attempt | ||
| ); | ||
| return Ok(()); | ||
| } | ||
| } | ||
| } | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
move to the top of the file or maybe have as separate arg to cli so you don't tune it in sources every time?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I will fix!