cmd/gguf-parser/README.md (3 additions, 1 deletion)
@@ -36,13 +36,15 @@ GLOBAL OPTIONS:
    --no-kv-offload, --nkvo                              Specify disabling Key-Value offloading, which is used to estimate the usage. Disabling Key-Value offloading can reduce VRAM usage. (default: false)
    --no-mmap                                            Specify disabling the use of memory mapping, which is used to estimate the usage. Memory mapping can avoid loading the entire model weights into RAM. (default: false)
    --parallel-size value, --parallel value, --np value  Specify the number of parallel sequences to decode, which is used to estimate the usage. (default: 1)
-   --platform-footprint value                           Specify the platform footprint (RAM,VRAM) in MiB, which is used to estimate the NonUMA usage; the default is 150,250. Different platforms get different RAM and VRAM footprints; for example, within CUDA, 'cudaMemGetInfo' occupies some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250")
+   --platform-footprint value                           Specify the platform footprint (RAM,VRAM) of the running host in MiB, which is used to estimate the NonUMA usage; the default is 150,250. Different platforms get different RAM and VRAM footprints; for example, within CUDA, 'cudaMemGetInfo' occupies some RAM and VRAM, see https://stackoverflow.com/questions/64854862/free-memory-occupied-by-cudamemgetinfo. (default: "150,250")
    --split-mode value, --sm value                       Specify how to split the model across multiple devices, which is used to estimate the usage; select from [layer, row, none]. Since gguf-parser always estimates VRAM usage, "none" is meaningless here and is kept only for compatibility. (default: "layer")
    --tensor-split value, --ts value                     Specify the fraction of the model to offload to each device, which is used to estimate the usage; it is a comma-separated list of integers. Since gguf-parser cannot recognize the host's GPU devices or RPC servers, you must explicitly set --tensor-split to indicate how many devices are used.
    --ubatch-size value, --ub value                      Specify the physical maximum batch size, which is used to estimate the usage. (default: 512)
 
  Load
 
+   --cache-expiration value                             Specify the cache expiration; works with --url/--hf-*/--ms-*/--ol-*. (default: 24h0m0s)
+   --cache-path value                                   Cache the read result to the given path; works with --url/--hf-*/--ms-*/--ol-*. (default: "/Users/thxcode/.cache/gguf-parser")
    --skip-cache                                         Skip the cache; works with --url/--hf-*/--ms-*/--ol-*; by default, the read result is cached. (default: false)
    --skip-dns-cache                                     Skip the DNS cache; works with --url/--hf-*/--ms-*/--ol-*; by default, the DNS lookup result is cached. (default: false)
    --skip-proxy                                         Skip proxy settings; works with --url/--hf-*/--ms-*/--ol-*; by default, the environment variables HTTP_PROXY/HTTPS_PROXY/NO_PROXY are respected. (default: false)
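
For illustration, a hypothetical invocation combining the estimation and caching flags touched by this diff might look like the sketch below. Only flags documented in the help text above are used; the model URL is a placeholder.

```sh
# Estimate usage for a remote GGUF model split across two devices,
# passing the host footprint explicitly and shortening the metadata
# cache expiration to one hour. The URL is a placeholder.
gguf-parser \
  --url "https://example.com/models/model.Q4_K_M.gguf" \
  --tensor-split "1,1" \
  --platform-footprint "150,250" \
  --cache-expiration 1h0m0s
```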