-
Notifications
You must be signed in to change notification settings - Fork 236
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improving spawn and exec syscalls #4785
Changes from 5 commits
87f41c9
ed787ce
34676ba
872809a
98c41ab
007704d
41a154e
6429889
ac0c2b0
74fd84b
cf2d2ef
5136613
9136fbc
b8a6620
4672878
0c31771
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
use crate::cost_model::transferred_byte_cycles; | ||
use crate::syscalls::{ | ||
INVALID_FD, MAX_FDS_CREATED, MAX_VMS_SPAWNED, OTHER_END_CLOSED, SPAWN_EXTRA_CYCLES_BASE, | ||
SUCCESS, WAIT_FAILURE, | ||
EXEC_LOAD_ELF_V2_CYCLES_BASE, INVALID_FD, MAX_FDS_CREATED, MAX_VMS_SPAWNED, OTHER_END_CLOSED, | ||
SPAWN_EXTRA_CYCLES_BASE, SUCCESS, WAIT_FAILURE, | ||
}; | ||
use crate::types::MachineContext; | ||
use crate::verify::TransactionScriptsSyscallsGenerator; | ||
|
@@ -15,14 +15,13 @@ use ckb_traits::{CellDataProvider, ExtensionProvider, HeaderProvider}; | |
use ckb_types::core::Cycle; | ||
use ckb_vm::snapshot2::Snapshot2Context; | ||
use ckb_vm::{ | ||
bytes::Bytes, | ||
cost_model::estimate_cycles, | ||
elf::parse_elf, | ||
machine::{CoreMachine, DefaultMachineBuilder, Pause, SupportMachine}, | ||
memory::Memory, | ||
registers::A0, | ||
snapshot2::Snapshot2, | ||
Error, Register, | ||
Error, FlattenedArgsReader, Register, | ||
}; | ||
use std::collections::{BTreeMap, HashMap}; | ||
use std::sync::{Arc, Mutex}; | ||
|
@@ -208,7 +207,7 @@ where | |
if self.states.is_empty() { | ||
// Booting phase, we will need to initialize the first VM. | ||
assert_eq!( | ||
self.boot_vm(&DataPieceId::Program, 0, u64::MAX, &[])?, | ||
self.boot_vm(&DataPieceId::Program, 0, u64::MAX, None)?, | ||
ROOT_VM_ID | ||
); | ||
} | ||
|
@@ -340,6 +339,29 @@ where | |
let messages: Vec<Message> = self.message_box.lock().expect("lock").drain(..).collect(); | ||
for message in messages { | ||
match message { | ||
Message::ExecV2(vm_id, args) => { | ||
let (_, old_machine) = self | ||
.instantiated | ||
.get_mut(&vm_id) | ||
.ok_or_else(|| Error::Unexpected("Unable to find VM Id".to_string()))?; | ||
old_machine | ||
.machine | ||
.add_cycles(EXEC_LOAD_ELF_V2_CYCLES_BASE)?; | ||
let old_cycles = old_machine.machine.cycles(); | ||
let max_cycles = old_machine.machine.max_cycles(); | ||
let (context, mut new_machine) = self.create_dummy_vm(&vm_id)?; | ||
new_machine.set_max_cycles(max_cycles); | ||
new_machine.machine.add_cycles_no_checking(old_cycles)?; | ||
self.load_vm_program( | ||
&context, | ||
&mut new_machine, | ||
&args.data_piece_id, | ||
args.offset, | ||
args.length, | ||
Some((vm_id, args.argc, args.argv)), | ||
)?; | ||
self.instantiated.insert(vm_id, (context, new_machine)); | ||
} | ||
Message::Spawn(vm_id, args) => { | ||
// All fds must belong to the correct owner | ||
if args.fds.iter().any(|fd| self.fds.get(fd) != Some(&vm_id)) { | ||
|
@@ -352,8 +374,12 @@ where | |
machine.machine.set_register(A0, MAX_VMS_SPAWNED as u64); | ||
continue; | ||
} | ||
let spawned_vm_id = | ||
self.boot_vm(&args.data_piece_id, args.offset, args.length, &args.argv)?; | ||
let spawned_vm_id = self.boot_vm( | ||
&args.data_piece_id, | ||
args.offset, | ||
args.length, | ||
Some((vm_id, args.argc, args.argv)), | ||
)?; | ||
// Move passed fds from spawner to spawnee | ||
for fd in &args.fds { | ||
self.fds.insert(*fd, spawned_vm_id); | ||
|
@@ -749,8 +775,12 @@ where | |
data_piece_id: &DataPieceId, | ||
offset: u64, | ||
length: u64, | ||
args: &[Bytes], | ||
args: Option<(u64, u64, u64)>, | ||
) -> Result<VmId, Error> { | ||
let id = self.next_vm_id; | ||
self.next_vm_id += 1; | ||
let (context, mut machine) = self.create_dummy_vm(&id)?; | ||
self.load_vm_program(&context, &mut machine, data_piece_id, offset, length, args)?; | ||
// Newly booted VM will be instantiated by default | ||
while self.instantiated.len() >= MAX_INSTANTIATED_VMS { | ||
// Instantiated is a BTreeMap, first_entry will maintain key order | ||
|
@@ -761,26 +791,40 @@ where | |
.key(); | ||
self.suspend_vm(&id)?; | ||
} | ||
|
||
let id = self.next_vm_id; | ||
self.next_vm_id += 1; | ||
let (context, mut machine) = self.create_dummy_vm(&id)?; | ||
{ | ||
let mut sc = context.snapshot2_context().lock().expect("lock"); | ||
let (program, _) = sc.load_data(data_piece_id, offset, length)?; | ||
let metadata = parse_elf::<u64>(&program, machine.machine.version())?; | ||
let bytes = machine.load_program_with_metadata(&program, &metadata, args)?; | ||
sc.mark_program(&mut machine.machine, &metadata, data_piece_id, offset)?; | ||
machine | ||
.machine | ||
.add_cycles_no_checking(transferred_byte_cycles(bytes))?; | ||
} | ||
self.instantiated.insert(id, (context, machine)); | ||
self.states.insert(id, VmState::Runnable); | ||
|
||
Ok(id) | ||
} | ||
|
||
// Load the program into an empty vm. | ||
fn load_vm_program( | ||
&mut self, | ||
context: &MachineContext<DL>, | ||
machine: &mut Machine, | ||
data_piece_id: &DataPieceId, | ||
offset: u64, | ||
length: u64, | ||
args: Option<(u64, u64, u64)>, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. How about creating a new struct: struct VmArgs {
vm_id: u64,
argc: u64,
argv: u64,
} to replace the anonymouse turple There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the previous internal review, Xuejie suggested that I use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure either. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I did re-check offline communication logs, previously this is represented using the following structure:
Since the enum only had 2 variant: one that has 3 arguments, while the other is empty. I just believe that this 2-value enum, could in fact simply be an I have no opinion whether it is |
||
) -> Result<u64, Error> { | ||
let mut sc = context.snapshot2_context().lock().expect("lock"); | ||
let (program, _) = sc.load_data(data_piece_id, offset, length)?; | ||
let metadata = parse_elf::<u64>(&program, machine.machine.version())?; | ||
let bytes = match args { | ||
Some((vm_id, argc, argv)) => { | ||
let (_, machine_from) = self.ensure_get_instantiated(&vm_id)?; | ||
let argv = FlattenedArgsReader::new(machine_from.machine.memory_mut(), argc, argv); | ||
machine.load_program_with_metadata(&program, &metadata, argv)? | ||
} | ||
None => machine.load_program_with_metadata(&program, &metadata, vec![].into_iter())?, | ||
}; | ||
sc.mark_program(&mut machine.machine, &metadata, data_piece_id, offset)?; | ||
machine | ||
.machine | ||
.add_cycles_no_checking(transferred_byte_cycles(bytes))?; | ||
Ok(bytes) | ||
} | ||
|
||
// Create a new VM instance with syscalls attached | ||
fn create_dummy_vm(&self, id: &VmId) -> Result<(MachineContext<DL>, Machine), Error> { | ||
// The code here looks slightly weird, since I don't want to copy over all syscall | ||
|
Original file line number | Diff line number | Diff line change | ||||||||
---|---|---|---|---|---|---|---|---|---|---|
@@ -0,0 +1,121 @@ | ||||||||||
use crate::syscalls::{ | ||||||||||
Place, Source, EXEC, INDEX_OUT_OF_BOUND, SLICE_OUT_OF_BOUND, SOURCE_ENTRY_MASK, | ||||||||||
SOURCE_GROUP_FLAG, | ||||||||||
}; | ||||||||||
use crate::types::{DataPieceId, ExecV2Args, Message, TxData, VmId}; | ||||||||||
use ckb_traits::{CellDataProvider, ExtensionProvider, HeaderProvider}; | ||||||||||
use ckb_vm::{ | ||||||||||
registers::{A0, A1, A2, A3, A4, A5, A7}, | ||||||||||
snapshot2::Snapshot2Context, | ||||||||||
Error as VMError, Register, SupportMachine, Syscalls, | ||||||||||
}; | ||||||||||
use std::sync::{Arc, Mutex}; | ||||||||||
|
||||||||||
pub struct ExecV2<DL> | ||||||||||
where | ||||||||||
DL: CellDataProvider + HeaderProvider + ExtensionProvider + Send + Sync + Clone + 'static, | ||||||||||
{ | ||||||||||
id: VmId, | ||||||||||
message_box: Arc<Mutex<Vec<Message>>>, | ||||||||||
snapshot2_context: Arc<Mutex<Snapshot2Context<DataPieceId, TxData<DL>>>>, | ||||||||||
} | ||||||||||
|
||||||||||
impl<DL> ExecV2<DL> | ||||||||||
where | ||||||||||
DL: CellDataProvider + HeaderProvider + ExtensionProvider + Send + Sync + Clone + 'static, | ||||||||||
{ | ||||||||||
pub fn new( | ||||||||||
id: VmId, | ||||||||||
message_box: Arc<Mutex<Vec<Message>>>, | ||||||||||
snapshot2_context: Arc<Mutex<Snapshot2Context<DataPieceId, TxData<DL>>>>, | ||||||||||
) -> ExecV2<DL> { | ||||||||||
ExecV2 { | ||||||||||
id, | ||||||||||
message_box, | ||||||||||
snapshot2_context, | ||||||||||
} | ||||||||||
} | ||||||||||
} | ||||||||||
|
||||||||||
impl<Mac, DL> Syscalls<Mac> for ExecV2<DL> | ||||||||||
where | ||||||||||
Mac: SupportMachine, | ||||||||||
DL: CellDataProvider + HeaderProvider + ExtensionProvider + Send + Sync + Clone + 'static, | ||||||||||
{ | ||||||||||
fn initialize(&mut self, _machine: &mut Mac) -> Result<(), VMError> { | ||||||||||
Ok(()) | ||||||||||
} | ||||||||||
|
||||||||||
fn ecall(&mut self, machine: &mut Mac) -> Result<bool, VMError> { | ||||||||||
if machine.registers()[A7].to_u64() != EXEC { | ||||||||||
return Ok(false); | ||||||||||
} | ||||||||||
let index = machine.registers()[A0].to_u64(); | ||||||||||
let mut source = machine.registers()[A1].to_u64(); | ||||||||||
let place = machine.registers()[A2].to_u64(); | ||||||||||
// To keep compatible with the old behavior. When Source is wrong, a | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I do want to ask here: if old behavior is not a concern, what a proper solution here should be? Same question goes for There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I suggest removing this part of the compatibility code. |
||||||||||
// Vm internal error should be returned. | ||||||||||
if let Source::Group(_) = Source::parse_from_u64(source)? { | ||||||||||
source = source & SOURCE_ENTRY_MASK | SOURCE_GROUP_FLAG; | ||||||||||
} else { | ||||||||||
source &= SOURCE_ENTRY_MASK; | ||||||||||
} | ||||||||||
// To keep compatible with the old behavior. | ||||||||||
Place::parse_from_u64(place)?; | ||||||||||
|
||||||||||
let data_piece_id = match DataPieceId::try_from((source, index, place)) { | ||||||||||
Ok(id) => id, | ||||||||||
Err(_) => { | ||||||||||
machine.set_register(A0, Mac::REG::from_u8(INDEX_OUT_OF_BOUND)); | ||||||||||
return Ok(true); | ||||||||||
} | ||||||||||
}; | ||||||||||
let bounds = machine.registers()[A3].to_u64(); | ||||||||||
let offset = bounds >> 32; | ||||||||||
let length = bounds as u32 as u64; | ||||||||||
|
||||||||||
// We are fetching the actual cell here for some in-place validation | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is there a chance we can move those validations to the scheduler part, so we only loads the actual program once? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That should be possible, let me simplify it. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the current implementation, it is not possible to load data only once. Because in fact, they are two different calls. The first one is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What if we move the validation code till later part when the length and the data are fetched in the scheduler part? Total length is only there for verification purposes, and loading per Still we might hit other roadblockers, I just wonder if the code can be simplified in v 2 There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I tried to do this, but encountered some compatibility issues, and fixing these compatibility issues may involve modules that this PR does not care about. For example, in DataSource, sc.load_data always returns a Bytes array (possibly an empty array) regardless of offset and length. Therefore, when the developer incorrectly passes in offset and length, in your modification plan, the scheduler will incorrectly execute the elf parsing step and return VMError. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In the old exec implementation, we would first do a bounds check and then try to parse elf. But the DataSource does no bounds checking -- if you go out of bounds, it returns empty bytes. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I believe we now have a second chance to re-design exec in v2, the old exec implementation can indeed provide inspiration but I don't believe we should strictly follow the old rules exactly. Might be worthwhile to rethink what we really need here There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've been toying with the code here, and this commit fully captures my opinions. Of course it changes current exec's behavior, so out-of-bound offset/length does not result in errors anymore, they only chop the loaded binary as much as possible(e.g., when offset is bigger than data length, an empty buffer will be returned).
So on a personal opinion, I would remove those redundant checks on offset/length in exec's V2 design. We might not be able to do the same thing this time for spawn(since it already is activated on testnet), but I do believe it makes sense to alter exec's design now, and also change spawn's design when we have a chance. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I have made the changes according to your opinions. |
||||||||||
let mut sc = self | ||||||||||
.snapshot2_context | ||||||||||
.lock() | ||||||||||
.map_err(|e| VMError::Unexpected(e.to_string()))?; | ||||||||||
let (_, full_length) = match sc.load_data(&data_piece_id, 0, 0) { | ||||||||||
Ok(val) => val, | ||||||||||
Err(VMError::SnapshotDataLoadError) => { | ||||||||||
// This comes from TxData results in an out of bound error, to | ||||||||||
// mimic current behavior, we would return INDEX_OUT_OF_BOUND error. | ||||||||||
machine.set_register(A0, Mac::REG::from_u8(INDEX_OUT_OF_BOUND)); | ||||||||||
return Ok(true); | ||||||||||
} | ||||||||||
Err(e) => return Err(e), | ||||||||||
}; | ||||||||||
if offset >= full_length { | ||||||||||
machine.set_register(A0, Mac::REG::from_u8(SLICE_OUT_OF_BOUND)); | ||||||||||
return Ok(true); | ||||||||||
} | ||||||||||
if length > 0 { | ||||||||||
let end = offset.checked_add(length).ok_or(VMError::MemOutOfBound)?; | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. both There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
This description is correct, and we can probably remove this redundant check. L92->L96 is used to determine whether the data offset exceeds the total length of the data. Why can it be omitted? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
I think if offset >= full_length {
machine.set_register(A0, Mac::REG::from_u8(SLICE_OUT_OF_BOUND));
return Ok(true);
}
if length > 0 {
let end = offset.checked_add(length).ok_or(VMError::MemOutOfBound)?;
if end > full_length {
machine.set_register(A0, Mac::REG::from_u8(SLICE_OUT_OF_BOUND));
return Ok(true);
}
} is equal to: let end = offset.checked_add(length).ok_or(VMError::MemOutOfBound)?;
if end >= full_length {
machine.set_register(A0, Mac::REG::from_u8(SLICE_OUT_OF_BOUND));
return Ok(true);
} Is above replacement correct? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
We can't combine it. Considering offset == 0 and length == data.len(), we should expect it to succeed, but in the code you gave, it will return SLICE_OUT_OF_BOUND error, |
||||||||||
if end > full_length { | ||||||||||
machine.set_register(A0, Mac::REG::from_u8(SLICE_OUT_OF_BOUND)); | ||||||||||
return Ok(true); | ||||||||||
} | ||||||||||
} | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. What should There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It will try to load an empty elf file, which will result in elf parsing errors. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should we handle the check There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please forgive me, I made a mistake earlier, when length == 0, data[offset...] will be read instead of empty bytes; The current code behavior is correct. |
||||||||||
|
||||||||||
let argc = machine.registers()[A4].clone(); | ||||||||||
let argv = machine.registers()[A5].clone(); | ||||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. L53->L55 is: let index = machine.registers()[A0].to_u64();
let mut source = machine.registers()[A1].to_u64();
let place = machine.registers()[A2].to_u64(); So how about use
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, we don't have to clone the data here. |
||||||||||
self.message_box | ||||||||||
.lock() | ||||||||||
.map_err(|e| VMError::Unexpected(e.to_string()))? | ||||||||||
.push(Message::ExecV2( | ||||||||||
self.id, | ||||||||||
ExecV2Args { | ||||||||||
data_piece_id, | ||||||||||
offset, | ||||||||||
length, | ||||||||||
argc: argc.to_u64(), | ||||||||||
argv: argv.to_u64(), | ||||||||||
}, | ||||||||||
)); | ||||||||||
Err(VMError::Yield) | ||||||||||
} | ||||||||||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Please cap the number of maximum instantiated virtual machines here.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Exec will always remove an old running vm instance and then join a new one, so the maximum number of instantiations will not be triggered.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Ah I see, in that sense this code is correct, but can we change the code to the following flow to better illustrate this fact:
The latter insert line can be quite confusing and maybe one day we forgot about this underlying logic.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
In this way of writing,
machine_n_context
borrowsself
as mutable, This will causeself.create_dummy_vm
andself.load_vm_program
to no longer be usableThere was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I see, you are right here. In this sence I recommend the following line as a hint:
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
OK. I'll add a comment as well.