@@ -106,7 +106,7 @@ func NewRunExecutor(tempDir string, homeDir string, sshPort int) (*RunExecutor,
106106 if runtime .GOOS == "linux" {
107107 proc , err := procfs .NewDefaultFS ()
108108 if err != nil {
109- return nil , fmt .Errorf ("failed to initialize procfs: %w" , err )
109+ return nil , fmt .Errorf ("initialize procfs: %w" , err )
110110 }
111111 connectionTracker = connections .NewConnectionTracker (connections.ConnectionTrackerConfig {
112112 Port : uint64 (sshPort ),
@@ -516,7 +516,7 @@ func (ex *RunExecutor) execJob(ctx context.Context, jobLogFile io.Writer) error
516516
517517 err = writeMpiHostfile (ctx , ex .clusterInfo .JobIPs , gpus_per_node_num , mpiHostfilePath )
518518 if err != nil {
519- return err
519+ return fmt . Errorf ( "write MPI hostfile: %w" , err )
520520 }
521521
522522 cmd .Env = envMap .Render ()
@@ -743,7 +743,7 @@ func parseStringId(stringId string) (uint32, error) {
743743 return 0 , err
744744 }
745745 if id < 0 {
746- return 0 , fmt .Errorf ("negative value: %d" , id )
746+ return 0 , fmt .Errorf ("negative id value: %d" , id )
747747 }
748748 return uint32 (id ), nil
749749}
@@ -755,7 +755,7 @@ func parseStringId(stringId string) (uint32, error) {
755755func startCommand (cmd * exec.Cmd ) (* os.File , error ) {
756756 ptm , pts , err := pty .Open ()
757757 if err != nil {
758- return nil , err
758+ return nil , fmt . Errorf ( "open pty: %w" , err )
759759 }
760760 defer func () { _ = pts .Close () }()
761761
@@ -780,13 +780,13 @@ func startCommand(cmd *exec.Cmd) (*os.File, error) {
780780 uid := cmd .SysProcAttr .Credential .Uid
781781 if err := os .Chown (pts .Name (), int (uid ), - 1 ); err != nil {
782782 _ = ptm .Close ()
783- return nil , err
783+ return nil , fmt . Errorf ( "chown pty slave: %w" , err )
784784 }
785785 }
786786
787787 if err := cmd .Start (); err != nil {
788788 _ = ptm .Close ()
789- return nil , err
789+ return nil , fmt . Errorf ( "start command: %w" , err )
790790 }
791791 return ptm , nil
792792}
@@ -830,28 +830,28 @@ func prepareSSHDir(uid int, gid int, homeDir string) (string, error) {
830830 return "" , fmt .Errorf ("not a directory: %s" , sshDir )
831831 }
832832 if err = os .Chmod (sshDir , 0o700 ); err != nil {
833- return "" , err
833+ return "" , fmt . Errorf ( "chmod ssh dir: %w" , err )
834834 }
835835 } else if errors .Is (err , os .ErrNotExist ) {
836836 if err = os .MkdirAll (sshDir , 0o700 ); err != nil {
837- return "" , err
837+ return "" , fmt . Errorf ( "create ssh dir: %w" , err )
838838 }
839839 } else {
840840 return "" , err
841841 }
842842 if err = os .Chown (sshDir , uid , gid ); err != nil {
843- return "" , err
843+ return "" , fmt . Errorf ( "chown ssh dir: %w" , err )
844844 }
845845 return sshDir , nil
846846}
847847
848848func writeMpiHostfile (ctx context.Context , ips []string , gpus_per_node int , path string ) error {
849849 if err := os .MkdirAll (filepath .Dir (path ), 0o755 ); err != nil {
850- return err
850+ return fmt . Errorf ( "create MPI hostfile directory: %w" , err )
851851 }
852852 file , err := os .OpenFile (path , os .O_CREATE | os .O_TRUNC | os .O_WRONLY , 0o644 )
853853 if err != nil {
854- return err
854+ return fmt . Errorf ( "open MPI hostfile: %w" , err )
855855 }
856856 defer file .Close ()
857857 nonEmptyIps := []string {}
@@ -864,7 +864,7 @@ func writeMpiHostfile(ctx context.Context, ips []string, gpus_per_node int, path
864864 for _ , ip := range nonEmptyIps {
865865 line := fmt .Sprintf ("%s slots=%d\n " , ip , gpus_per_node )
866866 if _ , err = file .WriteString (line ); err != nil {
867- return err
867+ return fmt . Errorf ( "write MPI hostfile line: %w" , err )
868868 }
869869 }
870870 } else {
@@ -875,11 +875,11 @@ func writeMpiHostfile(ctx context.Context, ips []string, gpus_per_node int, path
875875
876876func writeDstackProfile (env map [string ]string , pth string ) error {
877877 if err := os .MkdirAll (path .Dir (pth ), 0o755 ); err != nil {
878- return err
878+ return fmt . Errorf ( "create dstack profile directory: %w" , err )
879879 }
880880 file , err := os .OpenFile (pth , os .O_CREATE | os .O_TRUNC | os .O_WRONLY , 0o644 )
881881 if err != nil {
882- return err
882+ return fmt . Errorf ( "open dstack profile: %w" , err )
883883 }
884884 defer file .Close ()
885885 for key , value := range env {
@@ -889,29 +889,29 @@ func writeDstackProfile(env map[string]string, pth string) error {
889889 }
890890 line := fmt .Sprintf ("export %s='%s'\n " , key , strings .ReplaceAll (value , `'` , `'"'"'` ))
891891 if _ , err = file .WriteString (line ); err != nil {
892- return err
892+ return fmt . Errorf ( "write dstack profile: %w" , err )
893893 }
894894 }
895895 if _ , err = file .WriteString ("cd \" $DSTACK_WORKING_DIR\" \n " ); err != nil {
896- return err
896+ return fmt . Errorf ( "write dstack profile: %w" , err )
897897 }
898898 if err = os .Chmod (pth , 0o644 ); err != nil {
899- return err
899+ return fmt . Errorf ( "chmod dstack profile: %w" , err )
900900 }
901901 return nil
902902}
903903
// includeDstackProfile appends a POSIX-shell "." (source) line that refers to
// dstackProfilePath to the file at profilePath, creating that file if it does
// not exist, and then sets the file's permission bits to 0o644.
//
// dstackProfilePath is emitted inside single quotes. Any single quote it
// contains is rewritten as '"'"' (the same escaping writeDstackProfile applies
// to exported values), so the emitted line stays a single shell word.
//
// The returned error wraps the underlying failure with the step that failed:
// open, write, chmod, or close.
func includeDstackProfile(profilePath string, dstackProfilePath string) (err error) {
	file, err := os.OpenFile(profilePath, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
	if err != nil {
		return fmt.Errorf("open profile file: %w", err)
	}
	defer func() {
		// The file was written to, so a failed Close may mean lost data.
		// Report it unless an earlier, more specific error is already set.
		if closeErr := file.Close(); closeErr != nil && err == nil {
			err = fmt.Errorf("close profile file: %w", closeErr)
		}
	}()

	quotedPath := strings.ReplaceAll(dstackProfilePath, `'`, `'"'"'`)
	if _, err = fmt.Fprintf(file, "\n. '%s'\n", quotedPath); err != nil {
		return fmt.Errorf("write profile include: %w", err)
	}
	// OpenFile's mode only applies on creation (and is filtered by umask);
	// chmod makes the final permissions explicit for pre-existing files too.
	if err = os.Chmod(profilePath, 0o644); err != nil {
		return fmt.Errorf("chmod profile file: %w", err)
	}
	return nil
}
@@ -920,37 +920,37 @@ func configureSSH(private string, public string, ips []string, port int, uid int
920920 privatePath := filepath .Join (sshDir , "dstack_job" )
921921 privateFile , err := os .OpenFile (privatePath , os .O_TRUNC | os .O_WRONLY | os .O_CREATE , 0o600 )
922922 if err != nil {
923- return err
923+ return fmt . Errorf ( "open private key file: %w" , err )
924924 }
925925 defer privateFile .Close ()
926926 if err := os .Chown (privatePath , uid , gid ); err != nil {
927- return err
927+ return fmt . Errorf ( "chown private key: %w" , err )
928928 }
929929 if _ , err := privateFile .WriteString (private ); err != nil {
930- return err
930+ return fmt . Errorf ( "write private key: %w" , err )
931931 }
932932
933933 akPath := filepath .Join (sshDir , "authorized_keys" )
934934 akFile , err := os .OpenFile (akPath , os .O_APPEND | os .O_WRONLY | os .O_CREATE , 0o600 )
935935 if err != nil {
936- return err
936+ return fmt . Errorf ( "open authorized_keys: %w" , err )
937937 }
938938 defer akFile .Close ()
939939 if err := os .Chown (akPath , uid , gid ); err != nil {
940- return err
940+ return fmt . Errorf ( "chown authorized_keys: %w" , err )
941941 }
942942 if _ , err := akFile .WriteString (public ); err != nil {
943- return err
943+ return fmt . Errorf ( "write public key: %w" , err )
944944 }
945945
946946 configPath := filepath .Join (sshDir , "config" )
947947 configFile , err := os .OpenFile (configPath , os .O_APPEND | os .O_WRONLY | os .O_CREATE , 0o600 )
948948 if err != nil {
949- return err
949+ return fmt . Errorf ( "open SSH config: %w" , err )
950950 }
951951 defer configFile .Close ()
952952 if err := os .Chown (configPath , uid , gid ); err != nil {
953- return err
953+ return fmt . Errorf ( "chown SSH config: %w" , err )
954954 }
955955 var configBuffer bytes.Buffer
956956 for _ , ip := range ips {
@@ -961,7 +961,7 @@ func configureSSH(private string, public string, ips []string, port int, uid int
961961 configBuffer .WriteString (fmt .Sprintf (" IdentityFile %s\n " , privatePath ))
962962 }
963963 if _ , err := configFile .Write (configBuffer .Bytes ()); err != nil {
964- return err
964+ return fmt . Errorf ( "write SSH config: %w" , err )
965965 }
966966 return nil
967967}
@@ -973,7 +973,7 @@ func configureSSH(private string, public string, ips []string, port int, uid int
973973func copyAuthorizedKeys (srcPath string , uid int , gid int , dstPath string ) error {
974974 srcFile , err := os .Open (srcPath )
975975 if err != nil {
976- return err
976+ return fmt . Errorf ( "open source authorized_keys: %w" , err )
977977 }
978978 defer srcFile .Close ()
979979
@@ -985,29 +985,29 @@ func copyAuthorizedKeys(srcPath string, uid int, gid int, dstPath string) error
985985 return fmt .Errorf ("is a directory: %s" , dstPath )
986986 }
987987 if err = os .Chmod (dstPath , 0o600 ); err != nil {
988- return err
988+ return fmt . Errorf ( "chmod destination authorized_keys: %w" , err )
989989 }
990990 } else if ! errors .Is (err , os .ErrNotExist ) {
991- return err
991+ return fmt . Errorf ( "stat destination authorized_keys: %w" , err )
992992 }
993993
994994 dstFile , err := os .OpenFile (dstPath , os .O_APPEND | os .O_WRONLY | os .O_CREATE , 0o600 )
995995 if err != nil {
996- return err
996+ return fmt . Errorf ( "open destination authorized_keys: %w" , err )
997997 }
998998 defer dstFile .Close ()
999999
10001000 if dstExists {
10011001 // visually separate our keys from existing ones
10021002 if _ , err := dstFile .WriteString ("\n \n " ); err != nil {
1003- return err
1003+ return fmt . Errorf ( "write separator to authorized_keys: %w" , err )
10041004 }
10051005 }
10061006 if _ , err := io .Copy (dstFile , srcFile ); err != nil {
1007- return err
1007+ return fmt . Errorf ( "copy authorized_keys: %w" , err )
10081008 }
10091009 if err = os .Chown (dstPath , uid , gid ); err != nil {
1010- return err
1010+ return fmt . Errorf ( "chown destination authorized_keys: %w" , err )
10111011 }
10121012
10131013 return nil
0 commit comments