Skip to content

Commit

Permalink
Remove default 1 ps and 1 worker #326 (#327)
Browse files Browse the repository at this point in the history
* Remove default 1 ps and 1 worker #326

* Fix TestUtils

* Fix hanging test

* Add /application_* to .gitignore, bump version in build.gradle
  • Loading branch information
erwa authored Jun 13, 2019
1 parent 626b7e4 commit 9de906f
Show file tree
Hide file tree
Showing 10 changed files with 32 additions and 39 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
.settings
.svn
.vscode/
/application_*
build/
dependency-reduced-pom.xml
log/
Expand Down
2 changes: 1 addition & 1 deletion build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ ext.deps = [

allprojects {
group = "com.linkedin.tony"
project.version = "0.3.14"
project.version = "0.3.15"
}

task sourcesJar(type: Jar) {
Expand Down
2 changes: 1 addition & 1 deletion tony-core/src/main/java/com/linkedin/tony/TonyClient.java
Original file line number Diff line number Diff line change
Expand Up @@ -698,7 +698,7 @@ private static void enforceResourceLimits(Configuration tonyConf) {
// For each jobtype, amount requested is (num X per instance * num instances).
long totalRequested = 0;
for (String jobType : jobTypes) {
int instances = tonyConf.getInt(TonyConfigurationKeys.getInstancesKey(jobType), TonyConfigurationKeys.getDefaultInstances(jobType));
int instances = tonyConf.getInt(TonyConfigurationKeys.getInstancesKey(jobType), 0);
String value = tonyConf.get(TonyConfigurationKeys.getResourceKey(jobType, resource), null);
if (value != null) {
long amountPerTask = resource.equals(Constants.MEMORY) ? Long.parseLong(Utils.parseMemoryString(value)) : Long.parseLong(value);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -179,16 +179,6 @@ public static String getMaxInstancesKey(String jobName) {
return String.format(TONY_PREFIX + "%s.max-instances", jobName);
}

/**
 * Returns the default number of instances for the given job type.
 *
 * @param jobName the job type name; must not be {@code null}
 * @return {@code 1} when {@code jobName} equals {@code Constants.PS_JOB_NAME} or
 *     {@code Constants.WORKER_JOB_NAME}, otherwise {@code 0}
 * @throws NullPointerException if {@code jobName} is {@code null}
 */
public static int getDefaultInstances(String jobName) {
  // Invoke equals on jobName itself so a null argument still fails fast.
  boolean hasImplicitInstance =
      jobName.equals(Constants.PS_JOB_NAME) || jobName.equals(Constants.WORKER_JOB_NAME);
  if (hasImplicitInstance) {
    return 1;
  }
  return 0;
}

/**
 * Builds the configuration key for one resource of a job type.
 *
 * @param jobName the job type name substituted into the first placeholder
 * @param resource the resource name substituted into the second placeholder
 * @return {@code TONY_PREFIX} followed by {@code <jobName>.<resource>}
 */
public static String getResourceKey(String jobName, String resource) {
  final String keyPattern = TONY_PREFIX + "%s.%s";
  return String.format(keyPattern, jobName, resource);
}
Expand Down
3 changes: 1 addition & 2 deletions tony-core/src/main/java/com/linkedin/tony/util/Utils.java
Original file line number Diff line number Diff line change
Expand Up @@ -353,8 +353,7 @@ public static Map<String, TensorFlowContainerRequest> parseContainerRequests(Con
Map<String, TensorFlowContainerRequest> containerRequests = new HashMap<>();
int priority = 0;
for (String jobName : jobNames) {
int numInstances = conf.getInt(TonyConfigurationKeys.getInstancesKey(jobName),
TonyConfigurationKeys.getDefaultInstances(jobName));
int numInstances = conf.getInt(TonyConfigurationKeys.getInstancesKey(jobName), 0);
String memoryString = conf.get(TonyConfigurationKeys.getResourceKey(jobName, Constants.MEMORY),
TonyConfigurationKeys.DEFAULT_MEMORY);
long memory = Long.parseLong(parseMemoryString(memoryString));
Expand Down
12 changes: 0 additions & 12 deletions tony-core/src/main/resources/tony-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -145,12 +145,6 @@
<value>1</value>
</property>

<property>
<description>Number of parameter servers to request.</description>
<name>tony.ps.instances</name>
<value>1</value>
</property>

<!-- Worker configurations -->
<property>
<description>Timeout, in milliseconds for the user's python processes before forcibly killing them.</description>
Expand All @@ -176,12 +170,6 @@
<value>0</value>
</property>

<property>
<description>Number of workers to request.</description>
<name>tony.worker.instances</name>
<value>1</value>
</property>

<!-- Untracked job type configurations -->
<property>
<description>Job types that we don't track to finish</description>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,9 @@ public void initializeMemberVariables() {
// We don't explicitly declare constants for these, since the configured TensorFlow job names
// are determined at runtime. But we still need default values for them in tony-default.xml.
// So ignore the fact that they exist in tony-default.xml and not in TonyConfigurationKeys.
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getInstancesKey(Constants.PS_JOB_NAME));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME, Constants.MEMORY));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getResourceKey(Constants.PS_JOB_NAME, Constants.VCORES));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getResourcesKey(Constants.PS_JOB_NAME));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getInstancesKey(Constants.WORKER_JOB_NAME));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, Constants.MEMORY));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, Constants.VCORES));
xmlPropsToSkipCompare.add(TonyConfigurationKeys.getResourceKey(Constants.WORKER_JOB_NAME, Constants.GPUS));
Expand Down
16 changes: 12 additions & 4 deletions tony-core/src/test/java/com/linkedin/tony/TestTonyE2E.java
Original file line number Diff line number Diff line change
Expand Up @@ -151,7 +151,9 @@ public void testPSWorkerTrainingShouldFailMissedHeartbeat() throws ParseExceptio
"--hdfs_classpath", libPath,
"--python_binary_path", "python",
"--container_env", Constants.SKIP_HADOOP_PATH + "=true",
"--container_env", Constants.TEST_TASK_EXECUTOR_NUM_HB_MISS + "=5"
"--container_env", Constants.TEST_TASK_EXECUTOR_NUM_HB_MISS + "=5",
"--conf", "tony.ps.instances=1",
"--conf", "tony.worker.instances=1",
});
int exitCode = client.start();
Assert.assertNotEquals(exitCode, 0);
Expand Down Expand Up @@ -212,7 +214,9 @@ public void testPSWorkerTrainingShouldFail() throws ParseException, IOException
"--executes", "exit_1.py",
"--hdfs_classpath", libPath,
"--python_binary_path", "python",
"--container_env", Constants.SKIP_HADOOP_PATH + "=true"
"--container_env", Constants.SKIP_HADOOP_PATH + "=true",
"--conf", "tony.ps.instances=1",
"--conf", "tony.worker.instances=1",
});
int exitCode = client.start();
Assert.assertEquals(exitCode, -1);
Expand Down Expand Up @@ -260,7 +264,7 @@ public void testAMCrashTonyShouldFail() throws ParseException, IOException {
public void testAMStopsJobAfterWorker0Killed() throws ParseException, IOException {
client.init(new String[]{"--src_dir", "tony-core/src/test/resources/scripts", "--executes", "exit_0.py",
"--hdfs_classpath", libPath, "--python_binary_path", "python", "--container_env",
Constants.TEST_WORKER_TERMINATED + "=true"});
Constants.TEST_WORKER_TERMINATED + "=true", "--conf", "tony.worker.instances=1"});
int exitCode = client.start();
Assert.assertEquals(exitCode, -1);
}
Expand Down Expand Up @@ -290,7 +294,9 @@ public void testNonChiefWorkerFail() throws ParseException, IOException {
"--executes", "exit_1.py",
"--hdfs_classpath", libPath,
"--python_binary_path", "python",
"--container_env", Constants.SKIP_HADOOP_PATH + "=true"
"--container_env", Constants.SKIP_HADOOP_PATH + "=true",
"--conf", "tony.ps.instances=1",
"--conf", "tony.worker.instances=1"
});
int exitCode = client.start();
Assert.assertEquals(exitCode, -1);
Expand Down Expand Up @@ -363,6 +369,8 @@ public void testTonyClientCallbackHandler() throws IOException, ParseException {
"--shell_env", "ENV_CHECK=ENV_CHECK",
"--container_env", Constants.SKIP_HADOOP_PATH + "=true",
"--python_venv", "tony-core/src/test/resources/test.zip",
"--conf", "tony.ps.instances=1",
"--conf", "tony.worker.instances=1",
});
client.addListener(handler);
int exitCode = client.start();
Expand Down
4 changes: 0 additions & 4 deletions tony-core/src/test/java/com/linkedin/tony/util/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -57,17 +57,13 @@ public void testParseContainerRequests() {
conf.addResource("tony-default.xml");
conf.setInt("tony.worker.instances", 3);
conf.setInt("tony.evaluator.instances", 1);
conf.set("tony.ps.memory", "3g");
conf.setInt("tony.worker.gpus", 1);
conf.setInt("tony.evaluator.vcores", 2);
conf.setInt("tony.chief.gpus", 1);

Map<String, TensorFlowContainerRequest> requests = Utils.parseContainerRequests(conf);
// PS and worker should use default 1 instance
assertEquals(1, requests.get("ps").getNumInstances());
assertEquals(3, requests.get("worker").getNumInstances());
assertEquals(1, requests.get("evaluator").getNumInstances());
assertEquals(3072, requests.get("ps").getMemory());
assertEquals(1, requests.get("worker").getGPU());
assertEquals(2, requests.get("evaluator").getVCores());
// Check default value.
Expand Down
19 changes: 16 additions & 3 deletions tony-examples/mnist-tensorflow/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,25 @@ zip -r venv.zip venv
TonY only requires YARN, not HDFS. Please see the [open-source documentation](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/SingleCluster.html) on how to set YARN up.


### Disabling security
### Configuration

If your Hadoop cluster is not running with security enabled (e.g.: for local testing), you can disable security by creating a config file as follows:
Below is an example config file that requests 2 workers and 1 parameter server. We also assume our Hadoop cluster
does NOT have security enabled (e.g., for local testing), so we disable TonY's security support.

```
<configuration>
<property>
<name>tony.worker.instances</name>
<value>2</value>
</property>
<property>
<name>tony.worker.memory</name>
<value>4g</value>
</property>
<property>
<name>tony.ps.instances</name>
<value>1</value>
</property>
<property>
<name>tony.application.security.enabled</name>
<value>false</value>
Expand Down Expand Up @@ -83,4 +96,4 @@ java -cp `hadoop classpath`:/path/to/TonY/tony-cli/build/libs/tony-cli-x.x.x-all
--python_binary_path=venv/bin/python # relative path inside venv.zip
```

*We have tested this example with 1 Parameter Server (4GB RAM + 1 vCPU) + 2 Workers (4GB RAM + 1 vCPU)
We have tested this example with 1 Parameter Server (4GB RAM + 1 vCPU) + 2 Workers (4GB RAM + 1 vCPU).

0 comments on commit 9de906f

Please sign in to comment.