/* TSP_GPU: This code is a GPU-based heuristic solver for the symmetric Traveling Salesman Problem with up to 110 cities, based on iterative hill climbing with 2-opt local search. Copyright (c) 2011-2020, Texas State University. All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. * Neither the name of Texas State University nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL TEXAS STATE UNIVERSITY BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. Authors: Martin Burtscher and Molly A. O'Neil URL: The latest version of this code is available at https://userweb.cs.txstate.edu/~burtscher/research/TSP_GPU/. Publication: This work is described in detail in the following paper. Molly A. O'Neil, Dan Tamir, and Martin Burtscher. A Parallel GPU Version of the Traveling Salesman Problem. Proceedings of the 2011 International Conference on Parallel and Distributed Processing Techniques and Applications, pp. 348-353. July 2011. */ #include #include #include // random number generation based on SPLASH-2 code #define MULT 1103515245 #define ADD 12345 #define MASK 0x7fffffff #define CIPOW2 128 /* power of 2 and as large as the largest possible input */ #define TOURS110 (65536*2) __device__ volatile unsigned int gblkcnt, gcurr; __device__ volatile unsigned long long best; /******************************************************************************/ /* Kernels to initialize best tour tracking and GPU state. */ /******************************************************************************/ __global__ void ResetKernel() { register int k; k = threadIdx.x + blockIdx.x * blockDim.x; if (k == 0) { best = 0xffffffffffffffffULL; } } __global__ void Reset110Kernel(int blocksTimesThreads) { gblkcnt = 0; gcurr = blocksTimesThreads; } /******************************************************************************/ /* The GPU TSP kernel: for each thread, initializes a first climber and */ /* randomizes a starting tour, then evaluates IHC steps until a local minimum */ /* is found, at which point a new climber is obtained from the worklist. */ /******************************************************************************/ __global__ __launch_bounds__(1024, 1) void TSP110Kernel(int *gdist, int *gresult, int touroffset, int cities, int tours, int *gtours) { register int i, j, change, mini, minj, minchange, randx, citiesm1, citiespad; register int ti, tj, tiplus1, tjplus1, dist_i_iplus1, ti_p1, mytour, from, to, cost; register int *sdist_ti, *sdist_tiplus1, *ltour; register unsigned long long mycost, current; unsigned char tour[110]; // local memory for coalesced accesses, shifted by one entry relative to gresult __shared__ int sdist[110 * 110]; for (i = threadIdx.x; i < cities * cities; i += blockDim.x) { sdist[i] = gdist[i]; } __syncthreads(); citiesm1 = cities - 1; citiespad = (citiesm1 + 33) & (~31); mytour = threadIdx.x + blockIdx.x * blockDim.x; if (mytour < tours) { // Default starting tour for this thread's first climber for (i = 0; i < citiesm1; i++) { tour[i] = i + 1; } tour[citiesm1] = 0; // Randomize the initial tour randx = mytour + touroffset; // use mytour as random seed for (i = 0; i < citiesm1; i++) { randx = (MULT * randx + ADD) & MASK; j = randx % citiesm1; to = tour[i]; tour[i] = tour[j]; tour[j] = to; } do { minchange = 0; ti = 0; // tour[-1] ti_p1= tour[0]; // Evaluate the 2-opt moves: Remove edges (i, i+1) and (j, j+1) and replace with (i, j) and (i+1, j+1) // Evaluate each city i (except first/last) against all cities from i+2 to last city, thus not evaluating // duplicate reversals or adjacent city pairs, which cannot improve tour. for (i = 2; i < cities; i++) { // loop bias = 2, loop from i=1 to i=cities-2 tiplus1 = ti_p1 * cities; dist_i_iplus1 = sdist[ti + ti_p1]; tj = ti_p1 = tour[i-1]; sdist_ti = &sdist[ti]; // Save pointers to i and i+1 rows of distance matrix sdist_tiplus1 = &sdist[tiplus1]; #pragma unroll 8 for (j = i; j < cities; j++) { tjplus1 = tour[j]; // Instead of recomputing new tour length after 2-opt move, just calculate the change from // adding edges (i, j) & (i+1, j+1) and removing edges (i, i+1) & (j, j+1) change = sdist_ti[tj] + sdist_tiplus1[tjplus1] - dist_i_iplus1 - sdist[tj * cities + tjplus1]; tj = tjplus1; // If tour length reduction is new minimum, save the (i, j) coordinates of the 2-opt move if ((minchange > change) && (j < cities)) { minchange = change; mini = i; minj = j; } } ti = tiplus1; } // If this climber found an improved tour, perform the city ordering swap // and continue evaluating with a new IHC step if (minchange < 0) { // new tour is 0 to mini, minj downto mini+1, minj+1 to cities i = mini - 2; j = minj - 1; while (i < j) { to = tour[j]; tour[j] = tour[i]; tour[i] = to; i++; j--; } } // Otherwise, this climber found a local minimum, so compute the tour cost, // record if best solution so far, and get a new climber from the worklist else { cost = 0; from = 0; for (i = 0; i < citiesm1; i++) { to = tour[i]; cost += sdist[from * cities + to]; from = to; } mycost = cost + sdist[from]; mycost = (mycost << 32) + mytour + touroffset; current = best; // Is our local minimum the best solution found? If so, compare-and-swap to // save it as the current best while (mycost < current) { atomicCAS((unsigned long long *)&best, current, mycost); current = best; } if (mycost == current) { ltour = >ours[mytour * citiespad]; for (i = 0; i < citiesm1; i++) { ltour[i] = tour[i]; } } // Get the next climber and randomize a new tour mytour = atomicAdd((int *)&gcurr, 1); if (mytour < tours) { for (i = 0; i < citiesm1; i++) { tour[i] = i + 1; } tour[citiesm1] = 0; randx = mytour + touroffset; for (i = 0; i < citiesm1; i++) { randx = (MULT * randx + ADD) & MASK; j = randx % (citiesm1); to = tour[i]; tour[i] = tour[j]; tour[j] = to; } } } } while (mytour < tours); } __syncthreads(); if (threadIdx.x == 0) { to = gridDim.x - 1; if (to == atomicInc((unsigned int *)&gblkcnt, to)) { mytour = best & 0xffffffff; gresult[0] = best >> 32; gresult[1] = 0; gresult[2] = mytour; mytour %= TOURS110; ltour = >ours[mytour * citiespad]; for (i = 0; i < citiesm1; i++) { gresult[i+3] = ltour[i]; } } } } /******************************************************************************/ /* Function to read the TSP database input file and initialize the distance */ /* matrix. */ /******************************************************************************/ static int readFile(char *filename, int *dist) { register int i, j, ch, cnt, cities; int i1; float i2, i3; register float *posx, *posy; register double dx, dy; register FILE *f; char str[256]; f = fopen(filename, "r+t"); if (f == NULL) {fprintf(stderr, "could not open file %s\n", filename); exit(-1);} ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f); ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f); ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f); ch = getc(f); while ((ch != EOF) && (ch != ':')) ch = getc(f); fscanf(f, "%s\n", str); cities = atoi(str); if (cities == 0) { fprintf(stderr, "%d cities\n", cities); exit(-1); } if (cities >= CIPOW2) { fprintf(stderr, "%d cities is too large\n", cities); exit(-1); } posx = (float *)malloc(sizeof(float) * cities); posy = (float *)malloc(sizeof(float) * cities); if ((posx == NULL) || (posy == NULL)) { fprintf(stderr, "out of memory\n"); exit(-1); } ch = getc(f); while ((ch != EOF) && (ch != '\n')) ch = getc(f); fscanf(f, "%s\n", str); if (strcmp(str, "NODE_COORD_SECTION") != 0) { fprintf(stderr, "wrong file format\n"); exit(-1); } cnt = 0; while (fscanf(f, "%d %f %f\n", &i1, &i2, &i3)) { posx[cnt] = i2; posy[cnt] = i3; cnt++; if (cnt > cities) {fprintf(stderr, "input too long\n"); exit(-1);} if (cnt != i1) {fprintf(stderr, "input line mismatch: expected %d instead of %d\n", cnt, i1); exit(-1);} } if (cnt != cities) { fprintf(stderr, "read %d instead of %d cities\n", cnt, cities); exit(-1); } fscanf(f, "%s", str); if (strcmp(str, "EOF") != 0) { fprintf(stderr, "didn't see 'EOF' at end of file\n"); exit(-1); } fclose(f); for (i = 0; i < cities; i++) { for (j = 0; j < cities; j++) { dx = posx[i] - posx[j]; dy = posy[i] - posy[j]; dist[j * cities + i] = dist[i * cities + j] = (int)(sqrt(dx * dx + dy * dy) + 0.5); } dist[i * cities + i] = 0x3fffffff; // half of maxint } free(posx); free(posy); return cities; } /******************************************************************************/ /* Functions to synchronize GPU threads and check for error status, as well */ /* as to ascertain number of SMs in device and proper architecture version. */ /******************************************************************************/ static void CudaTest(const char* const msg) { cudaError_t e; cudaDeviceSynchronize(); if (cudaSuccess != (e = cudaGetLastError())) { fprintf(stderr, "%s: %d\n", msg, e); fprintf(stderr, "%s\n", cudaGetErrorString(e)); exit(-1); } } static int VerifySystemParameters(int *SMs) { int deviceCount, currentDevice = 0, bestSMArch = 0; int maxComputePerf = 0, maxPerfDevice = 0, SMPerMP; int archCoresSM[3] = { 1, 8, 32 }; cudaDeviceProp deviceProp; cudaGetDeviceCount(&deviceCount); if(deviceCount <= 0) { fprintf(stderr, "There is no device supporting CUDA\n"); exit(-1); } // Find the best SM architecture device while(currentDevice < deviceCount) { cudaGetDeviceProperties(&deviceProp, currentDevice); if(deviceProp.major > 0 && deviceProp.major < 9999) { bestSMArch = max(bestSMArch, deviceProp.major); } currentDevice++; } // Find the best GPU device currentDevice = 0; while(currentDevice < deviceCount) { cudaGetDeviceProperties(&deviceProp, currentDevice); if(deviceProp.major == 9999 && deviceProp.minor == 9999) { SMPerMP = 1; } else if (deviceProp.major <= 2) { SMPerMP = archCoresSM[deviceProp.major]; } else { // SM major > 2 SMPerMP = archCoresSM[2]; } int computePerf = deviceProp.multiProcessorCount * SMPerMP * deviceProp.clockRate; if((deviceProp.major == bestSMArch) && (computePerf > maxComputePerf)) { maxComputePerf = computePerf; maxPerfDevice = currentDevice; } currentDevice++; } cudaGetDeviceProperties(&deviceProp, maxPerfDevice); if(deviceProp.major < 2) { fprintf(stderr, "No device found with compute capability 2.0 or above\n"); exit(-1); } *SMs = deviceProp.multiProcessorCount; return maxPerfDevice; } /******************************************************************************/ /* Run function reads input database and launches the GPU kernels. */ /* Prints to std out: GPU best tour and tour cost, as well as error from */ /* optimal tour read from input file. */ /******************************************************************************/ void run(char *filename, int tours, int SMs) { int *lgdist, tour, blocks, best; int *lgresult, *lgtours, *lscratch; int dist[CIPOW2 * CIPOW2]; int result[3 + CIPOW2]; int cities; cities = readFile(filename, dist); printf("%s: %d tours with %d cities each\n", filename, tours, cities); if (tours < 1) { fprintf(stderr, "tour count must be positive\n"); exit(-1); } if (cudaSuccess != cudaMalloc((void **)&lgtours, ((cities + 32) & (~31)) * min(TOURS110, tours) * sizeof(int))) fprintf(stderr, "could not allocate gtours\n"); CudaTest("couldn't allocate gtours"); if (cudaSuccess != cudaMalloc((void **)&lscratch, ((cities + 32) & (~31)) * min(TOURS110, tours) * sizeof(int))) fprintf(stderr, "could not allocate scratch\n"); CudaTest("couldn't allocate scratch"); if (cudaSuccess != cudaMalloc((void **)&lgresult, sizeof(int) * (cities + 3))) fprintf(stderr, "could not allocate gresult\n"); CudaTest("couldn't allocate gresult"); if (cudaSuccess != cudaMalloc((void **)&lgdist, sizeof(int) * cities * cities)) fprintf(stderr, "could not allocate gdist\n"); CudaTest("couldn't allocate gdist"); if (cudaSuccess != cudaMemcpy(lgdist, dist, sizeof(int) * cities * cities, cudaMemcpyHostToDevice)) fprintf(stderr, "copying of dist to device failed\n"); CudaTest("dist copy to device failed"); ResetKernel<<>>(); best = 0x7fffffff; tour = 0; if (cities <= 110) { blocks = min(tours, TOURS110); while (tours > tour) { Reset110Kernel<<<1, 1>>>(SMs*2*512); TSP110Kernel<<>>(lgdist, lgresult, tour, cities, blocks, lgtours); if (cudaSuccess != cudaMemcpy(result, lgresult, sizeof(int) * 2, cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of result from device failed\n"); CudaTest("result copy from device failed"); if (best > result[0]) { best = result[0]; if (cudaSuccess != cudaMemcpy(result, lgresult, sizeof(int) * (cities + 3), cudaMemcpyDeviceToHost)) fprintf(stderr, "copying of result from device failed\n"); CudaTest("result copy from device failed"); } tour += blocks; blocks = min(tours-tour, TOURS110); } } else { fprintf(stderr, "city count must be <= 110\n"); exit(-1); } printf("GPU min cost = %d\n", best); printf("GPU min tour = %d\n", result[2]); cudaFree(lgtours); cudaFree(lscratch); cudaFree(lgresult); cudaFree(lgdist); } /******************************************************************************/ /* MAIN */ /* Usage: ./TSP_GPU */ /******************************************************************************/ int main(int argc, char *argv[]) { register int climbers, SMs, deviceID; if(argc != 3) { fprintf(stderr, "usage: %s \n", argv[0]); exit(-1); } printf("\nTSP_GPU v1.1 Copyright (c) 2011-2020 Texas State University\n"); deviceID = VerifySystemParameters(&SMs); cudaSetDevice(deviceID); CudaTest("initialization"); cudaFuncSetCacheConfig(TSP110Kernel, cudaFuncCachePreferShared); climbers = atoi(argv[2]); run(argv[1], climbers, SMs); return 0; }