diff --git a/panda/src/display/displayRegion.cxx b/panda/src/display/displayRegion.cxx index 84c852d472d..df0e7a7292b 100644 --- a/panda/src/display/displayRegion.cxx +++ b/panda/src/display/displayRegion.cxx @@ -487,7 +487,9 @@ get_screenshot() { if (gsg->get_threading_model().get_draw_stage() != current_thread->get_pipeline_stage()) { // Ask the engine to do on the draw thread. GraphicsEngine *engine = window->get_engine(); - return engine->do_get_screenshot(this, gsg); + return engine->run_on_draw_thread([this] { + return get_screenshot(); + }); } // We are on the draw thread. diff --git a/panda/src/display/graphicsEngine.I b/panda/src/display/graphicsEngine.I index e9fc42ce44d..ca9142a8a81 100644 --- a/panda/src/display/graphicsEngine.I +++ b/panda/src/display/graphicsEngine.I @@ -171,3 +171,53 @@ INLINE void GraphicsEngine:: dispatch_compute(const LVecBase3i &work_groups, const ShaderAttrib *sattr, GraphicsStateGuardian *gsg) { dispatch_compute(work_groups, RenderState::make(sattr), gsg); } + +#ifndef CPPPARSER +/** + * Waits for the draw thread to become idle, then runs the given function on it. + */ +template +INLINE auto GraphicsEngine:: +run_on_draw_thread(Callable &&callable) -> decltype(callable()) { + ReMutexHolder holder(_lock); + std::string draw_name = _threading_model.get_draw_name(); + if (draw_name.empty()) { + return std::move(callable)(); + } else { + WindowRenderer *wr = get_window_renderer(draw_name, 0); + RenderThread *thread = (RenderThread *)wr; + return thread->run_on_thread(std::move(callable)); + } +} + +/** + * Waits for this thread to become idle, then runs the given function on it. + */ +template +INLINE auto GraphicsEngine::RenderThread:: +run_on_thread(Callable &&callable) -> + typename std::enable_if::value, decltype(callable())>::type { + + using ReturnType = decltype(callable()); + alignas(ReturnType) unsigned char storage[sizeof(ReturnType)]; + + run_on_thread([] (RenderThread *data) { + new (data->_return_data) ReturnType(std::move(*(Callable *)data->_callback_data)()); + }, &callable, storage); + + return *(ReturnType *)storage; +} + +/** + * Waits for this thread to become idle, then runs the given function on it. + */ +template +INLINE auto GraphicsEngine::RenderThread:: +run_on_thread(Callable &&callable) -> + typename std::enable_if::value, decltype(callable())>::type { + + run_on_thread([] (RenderThread *data) { + std::move(*(Callable *)data->_callback_data)(); + }, &callable, nullptr); +} +#endif // CPPPARSER diff --git a/panda/src/display/graphicsEngine.cxx b/panda/src/display/graphicsEngine.cxx index 5fab08e1551..474f8e29347 100644 --- a/panda/src/display/graphicsEngine.cxx +++ b/panda/src/display/graphicsEngine.cxx @@ -1134,47 +1134,9 @@ flip_frame() { */ bool GraphicsEngine:: extract_texture_data(Texture *tex, GraphicsStateGuardian *gsg) { - ReMutexHolder holder(_lock); - - string draw_name = gsg->get_threading_model().get_draw_name(); - if (draw_name.empty()) { - // A single-threaded environment. No problem. + return run_on_draw_thread([=] () { return gsg->extract_texture_data(tex); - - } else { - // A multi-threaded environment. We have to wait until the draw thread - // has finished its current task. - WindowRenderer *wr = get_window_renderer(draw_name, 0); - RenderThread *thread = (RenderThread *)wr; - MutexHolder cv_holder(thread->_cv_mutex); - - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - // Temporarily set this so that it accesses data from the current thread. 
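// Caller-side sketch of the pattern introduced by run_on_draw_thread
// (illustrative only, not part of the patch; `engine` and `region` are
// assumed to come from the surrounding code, and the caller needs the same
// access to GraphicsEngine that DisplayRegion has above).  The callable runs
// inline in a single-threaded model; otherwise the draw thread is first made
// idle and the return value is handed back to the calling thread:
PT(Texture) grabbed = engine->run_on_draw_thread([region] () {
  return region->get_screenshot();
});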
- int pipeline_stage = Thread::get_current_pipeline_stage(); - int draw_pipeline_stage = thread->get_pipeline_stage(); - thread->set_pipeline_stage(pipeline_stage); - - // Now that the draw thread is idle, signal it to do the extraction task. - thread->_gsg = gsg; - thread->_texture = tex; - thread->_thread_state = TS_do_extract_texture_data; - thread->_cv_mutex.release(); - thread->_cv_start.notify(); - thread->_cv_mutex.acquire(); - - // Wait for it to finish the extraction. - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - thread->set_pipeline_stage(draw_pipeline_stage); - thread->_gsg = nullptr; - thread->_texture = nullptr; - return thread->_result; - } + }); } /** @@ -1189,56 +1151,13 @@ extract_texture_data(Texture *tex, GraphicsStateGuardian *gsg) { */ vector_uchar GraphicsEngine:: extract_shader_buffer_data(ShaderBuffer *buffer, GraphicsStateGuardian *gsg) { - ReMutexHolder holder(_lock); - - string draw_name = gsg->get_threading_model().get_draw_name(); - if (draw_name.empty()) { - // A single-threaded environment. No problem. + return run_on_draw_thread([=] () { vector_uchar data; if (!gsg->extract_shader_buffer_data(buffer, data)) { data.clear(); } return data; - } - - // A multi-threaded environment. We have to wait until the draw thread - // has finished its current task. - WindowRenderer *wr = get_window_renderer(draw_name, 0); - RenderThread *thread = (RenderThread *)wr; - MutexHolder cv_holder(thread->_cv_mutex); - - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - // Temporarily set this so that it accesses data from the current thread. - int pipeline_stage = Thread::get_current_pipeline_stage(); - int draw_pipeline_stage = thread->get_pipeline_stage(); - thread->set_pipeline_stage(pipeline_stage); - - // Now that the draw thread is idle, signal it to do the extraction task. - vector_uchar data; - thread->_gsg = gsg; - thread->_buffer = buffer; - thread->_buffer_result = &data; - thread->_thread_state = TS_do_extract_shader_buffer_data; - thread->_cv_mutex.release(); - thread->_cv_start.notify(); - thread->_cv_mutex.acquire(); - - // Wait for it to finish the extraction. - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - thread->set_pipeline_stage(draw_pipeline_stage); - thread->_gsg = nullptr; - thread->_buffer = nullptr; - thread->_buffer_result = nullptr; - if (!thread->_result) { - data.clear(); - } - return data; + }); } /** @@ -1263,50 +1182,12 @@ dispatch_compute(const LVecBase3i &work_groups, const RenderState *state, Graphi nassertv(shader != nullptr); nassertv(gsg != nullptr); - ReMutexHolder holder(_lock); - - string draw_name = gsg->get_threading_model().get_draw_name(); - if (draw_name.empty()) { - // A single-threaded environment. No problem. + run_on_draw_thread([=] () { gsg->push_group_marker(std::string("Compute ") + shader->get_filename(Shader::ST_compute).get_basename()); gsg->set_state_and_transform(state, TransformState::make_identity()); gsg->dispatch_compute(work_groups[0], work_groups[1], work_groups[2]); gsg->pop_group_marker(); - - } else { - // A multi-threaded environment. We have to wait until the draw thread - // has finished its current task. - WindowRenderer *wr = get_window_renderer(draw_name, 0); - RenderThread *thread = (RenderThread *)wr; - MutexHolder cv_holder(thread->_cv_mutex); - - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - // Temporarily set this so that it accesses data from the current thread. 
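// For reference, a caller-side sketch of the compute dispatch that is being
// rerouted through run_on_draw_thread here (the ShaderAttrib `sattr` and the
// GSG are assumed to be set up elsewhere; they are not part of this patch):
engine->dispatch_compute(LVecBase3i(64, 64, 1), sattr, gsg);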
- int pipeline_stage = Thread::get_current_pipeline_stage(); - int draw_pipeline_stage = thread->get_pipeline_stage(); - thread->set_pipeline_stage(pipeline_stage); - - // Now that the draw thread is idle, signal it to do the compute task. - thread->_gsg = gsg; - thread->_state = state; - thread->_work_groups = work_groups; - thread->_thread_state = TS_do_compute; - thread->_cv_mutex.release(); - thread->_cv_start.notify(); - thread->_cv_mutex.acquire(); - - // Wait for it to finish the compute task. - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - thread->set_pipeline_stage(draw_pipeline_stage); - thread->_gsg = nullptr; - thread->_state = nullptr; - } + }); } /** @@ -1342,43 +1223,6 @@ texture_uploaded(Texture *tex) { // Usually only called by DisplayRegion::do_cull. } -/** - * Called by DisplayRegion::do_get_screenshot - */ -PT(Texture) GraphicsEngine:: -do_get_screenshot(DisplayRegion *region, GraphicsStateGuardian *gsg) { - // A multi-threaded environment. We have to wait until the draw thread - // has finished its current task. - - ReMutexHolder holder(_lock); - - const std::string &draw_name = gsg->get_threading_model().get_draw_name(); - WindowRenderer *wr = get_window_renderer(draw_name, 0); - RenderThread *thread = (RenderThread *)wr; - MutexHolder cv_holder(thread->_cv_mutex); - - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - // Now that the draw thread is idle, signal it to do the extraction task. - thread->_region = region; - thread->_thread_state = TS_do_screenshot; - thread->_cv_mutex.release(); - thread->_cv_start.notify(); - thread->_cv_mutex.acquire(); - - // Wait for it to finish the extraction. - while (thread->_thread_state != TS_wait) { - thread->_cv_done.wait(); - } - - PT(Texture) tex = std::move(thread->_texture); - thread->_region = nullptr; - thread->_texture = nullptr; - return tex; -} - /** * Fires off a cull traversal using the indicated camera. */ @@ -2867,31 +2711,9 @@ thread_main() { do_pending(_engine, current_thread); break; - case TS_do_compute: - nassertd(_gsg != nullptr && _state != nullptr) break; - { - const ShaderAttrib *sattr; - _state->get_attrib(sattr); - _gsg->push_group_marker(std::string("Compute ") + sattr->get_shader()->get_filename(Shader::ST_compute).get_basename()); - _gsg->set_state_and_transform(_state, TransformState::make_identity()); - _gsg->dispatch_compute(_work_groups[0], _work_groups[1], _work_groups[2]); - _gsg->pop_group_marker(); - } - break; - - case TS_do_extract_texture_data: - nassertd(_gsg != nullptr && _texture != nullptr) break; - _result = _gsg->extract_texture_data(_texture); - break; - - case TS_do_extract_shader_buffer_data: - nassertd(_gsg != nullptr && _texture != nullptr) break; - _result = _gsg->extract_shader_buffer_data(_buffer, *_buffer_result); - break; - - case TS_do_screenshot: - nassertd(_region != nullptr) break; - _texture = _region->get_screenshot(); + case TS_callback: + nassertd(_callback != nullptr) break; + _callback(this); break; case TS_terminate: @@ -2916,3 +2738,39 @@ thread_main() { } } } + +/** + * Waits for this thread to become idle, then runs the given function on it. + */ +void GraphicsEngine::RenderThread:: +run_on_thread(Callback *callback, void *callback_data, void *return_data) { + MutexHolder cv_holder(_cv_mutex); + + while (_thread_state != TS_wait) { + _cv_done.wait(); + } + + // Temporarily set this so that it accesses data from the current thread. 
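// The templated run_on_thread overloads in graphicsEngine.I funnel into this
// type-erased entry point: the callable travels through _callback_data, a
// captureless trampoline restores its type on the render thread, and a
// non-void result is placement-constructed into the caller-supplied
// _return_data storage.  A sketch of a call from within GraphicsEngine code
// (illustrative only; `thread` is an idle RenderThread):
int frame = thread->run_on_thread([] {
  return ClockObject::get_global_clock()->get_frame_count();
});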
+ int pipeline_stage = Thread::get_current_pipeline_stage(); + int thread_pipeline_stage = get_pipeline_stage(); + set_pipeline_stage(pipeline_stage); + + // Now that the draw thread is idle, signal it to run the callback. + _callback = callback; + _callback_data = callback_data; + _return_data = return_data; + _thread_state = TS_callback; + _cv_mutex.release(); + _cv_start.notify(); + _cv_mutex.acquire(); + + // Wait for it to finish the job. + while (_thread_state != TS_wait) { + _cv_done.wait(); + } + + set_pipeline_stage(thread_pipeline_stage); + _callback = nullptr; + _callback_data = nullptr; + _return_data = nullptr; +} diff --git a/panda/src/display/graphicsEngine.h b/panda/src/display/graphicsEngine.h index 16034a123a5..88874f8e509 100644 --- a/panda/src/display/graphicsEngine.h +++ b/panda/src/display/graphicsEngine.h @@ -35,6 +35,8 @@ #include "renderState.h" #include "clockObject.h" +#include + class Pipeline; class DisplayRegion; class GraphicsPipe; @@ -130,16 +132,17 @@ class EXPCL_PANDA_DISPLAY GraphicsEngine : public ReferenceCount { TS_do_flip, TS_do_release, TS_do_windows, - TS_do_compute, - TS_do_extract_texture_data, - TS_do_extract_shader_buffer_data, - TS_do_screenshot, + TS_callback, TS_terminate, TS_done }; void texture_uploaded(Texture *tex); - PT(Texture) do_get_screenshot(DisplayRegion *region, GraphicsStateGuardian *gsg); + +#ifndef CPPPARSER + template + INLINE auto run_on_draw_thread(Callable &&callable) -> decltype(callable()); +#endif public: static void do_cull(CullHandler *cull_handler, SceneSetup *scene_setup, @@ -306,21 +309,31 @@ class EXPCL_PANDA_DISPLAY GraphicsEngine : public ReferenceCount { RenderThread(const std::string &name, GraphicsEngine *engine); virtual void thread_main(); + typedef void Callback(RenderThread *thread); + void run_on_thread(Callback *callback, + void *callback_data = nullptr, + void *return_data = nullptr); + +#ifndef CPPPARSER + template + INLINE auto run_on_thread(Callable &&callable) -> + typename std::enable_if::value, decltype(callable())>::type; + + template + INLINE auto run_on_thread(Callable &&callable) -> + typename std::enable_if::value, decltype(callable())>::type; +#endif + GraphicsEngine *_engine; Mutex _cv_mutex; ConditionVar _cv_start; ConditionVar _cv_done; ThreadState _thread_state; - // These are stored for extract_texture_data and dispatch_compute. - GraphicsStateGuardian *_gsg; - PT(Texture) _texture; - ShaderBuffer *_buffer; - vector_uchar *_buffer_result; - const RenderState *_state; - DisplayRegion *_region; - LVecBase3i _work_groups; - bool _result; + // Used for TS_callback. + Callback *_callback; + void *_callback_data; + void *_return_data; }; WindowRenderer *get_window_renderer(const std::string &name, int pipeline_stage); diff --git a/panda/src/display/graphicsStateGuardian.cxx b/panda/src/display/graphicsStateGuardian.cxx index 9be45d34eff..80c02f88bb7 100644 --- a/panda/src/display/graphicsStateGuardian.cxx +++ b/panda/src/display/graphicsStateGuardian.cxx @@ -572,6 +572,23 @@ update_texture(TextureContext *, bool) { return true; } +/** + * Ensures that the current Texture data is refreshed onto the GSG. This + * means updating the texture properties and/or re-uploading the texture + * image, if necessary. This should only be called within the draw thread. + * + * If force is true, this function will not return until the texture has been + * fully uploaded. 
If force is false, the function may choose to upload a + * simple version of the texture instead, if the texture is not fully resident + * (and if get_incomplete_render() is true). + */ +bool GraphicsStateGuardian:: +update_texture(TextureContext *tc, bool force, CompletionToken token) { + bool result = update_texture(tc, force); + token.complete(result); + return result; +} + /** * Frees the resources previously allocated via a call to prepare_texture(), * including deleting the TextureContext itself, if it is non-NULL. diff --git a/panda/src/display/graphicsStateGuardian.h b/panda/src/display/graphicsStateGuardian.h index 3ed65b8e31c..039572a7afd 100644 --- a/panda/src/display/graphicsStateGuardian.h +++ b/panda/src/display/graphicsStateGuardian.h @@ -292,6 +292,7 @@ class EXPCL_PANDA_DISPLAY GraphicsStateGuardian : public GraphicsStateGuardianBa public: virtual TextureContext *prepare_texture(Texture *tex); virtual bool update_texture(TextureContext *tc, bool force); + virtual bool update_texture(TextureContext *tc, bool force, CompletionToken token); virtual void release_texture(TextureContext *tc); virtual void release_textures(const pvector &contexts); virtual bool extract_texture_data(Texture *tex); diff --git a/panda/src/event/asyncFuture.cxx b/panda/src/event/asyncFuture.cxx index d8b8b91d979..a6aa341c07b 100644 --- a/panda/src/event/asyncFuture.cxx +++ b/panda/src/event/asyncFuture.cxx @@ -389,6 +389,17 @@ wake_task(AsyncTask *task) { } } +/** + * Internal callback called when a CompletionToken created from this future + * completes. + */ +void AsyncFuture:: +token_callback(Completable::Data *data, bool success) { + AsyncFuture *future = (AsyncFuture *)data; + future->set_result(EventParameter(success)); + unref_delete(future); +} + /** * @see AsyncFuture::gather */ diff --git a/panda/src/event/asyncFuture.h b/panda/src/event/asyncFuture.h index acb6b1020fb..f9f71472f8d 100644 --- a/panda/src/event/asyncFuture.h +++ b/panda/src/event/asyncFuture.h @@ -20,6 +20,7 @@ #include "eventParameter.h" #include "patomic.h" #include "small_vector.h" +#include "completionToken.h" class AsyncTaskManager; class AsyncTask; @@ -58,7 +59,7 @@ class AsyncTask; * * @since 1.10.0 */ -class EXPCL_PANDA_EVENT AsyncFuture : public TypedReferenceCount { +class EXPCL_PANDA_EVENT AsyncFuture : public TypedReferenceCount, protected Completable::Data { PUBLISHED: INLINE AsyncFuture(); virtual ~AsyncFuture(); @@ -109,6 +110,8 @@ class EXPCL_PANDA_EVENT AsyncFuture : public TypedReferenceCount { private: void wake_task(AsyncTask *task); + static void token_callback(Completable::Data *, bool success); + protected: enum FutureState : patomic_unsigned_lock_free::value_type { // Pending states @@ -136,6 +139,7 @@ class EXPCL_PANDA_EVENT AsyncFuture : public TypedReferenceCount { friend class AsyncGatheringFuture; friend class AsyncTaskChain; + friend class CompletionToken; friend class PythonTask; public: @@ -199,6 +203,33 @@ class EXPCL_PANDA_EVENT AsyncGatheringFuture final : public AsyncFuture { static TypeHandle _type_handle; }; +#ifndef CPPPARSER +// Allow passing a future into a method accepting a CompletionToken. 
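// Caller-side sketch of what these converting constructors make possible
// (illustrative only; `gsg` and `tc` are assumed from the surrounding code,
// and the token-taking update_texture overload is the one added elsewhere in
// this patch):
//
//   PT(AsyncFuture) fut = new AsyncFuture;
//   gsg->update_texture(tc, false, fut);
//   // fut becomes done with a bool result once the upload completes,
//   // via AsyncFuture::token_callback above.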
+template<> +INLINE CompletionToken:: +CompletionToken(AsyncFuture *future) { + if (future != nullptr) { + future->ref(); + _callback._data = future; + if (_callback._data->_function == nullptr) { + _callback._data->_function = &AsyncFuture::token_callback; + } + } +} + +template<> +INLINE CompletionToken:: +CompletionToken(PT(AsyncFuture) future) { + if (future != nullptr) { + _callback._data = future; + if (_callback._data->_function == nullptr) { + _callback._data->_function = &AsyncFuture::token_callback; + } + future.cheat() = nullptr; + } +} +#endif + #include "asyncFuture.I" #endif // !ASYNCFUTURE_H diff --git a/panda/src/gles2gsg/gles2gsg.h b/panda/src/gles2gsg/gles2gsg.h index c43beded29d..feb30b41a05 100644 --- a/panda/src/gles2gsg/gles2gsg.h +++ b/panda/src/gles2gsg/gles2gsg.h @@ -148,6 +148,10 @@ typedef char GLchar; #define GL_TRANSFORM_FEEDBACK_BARRIER_BIT 0x800 #define GL_ATOMIC_COUNTER_BARRIER_BIT 0x1000 #define GL_SHADER_STORAGE_BARRIER_BIT 0x2000 +#define GL_MAP_INVALIDATE_RANGE_BIT 0x0004 +#define GL_MAP_INVALIDATE_BUFFER_BIT 0x0008 +#define GL_MAP_FLUSH_EXPLICIT_BIT 0x0010 +#define GL_MAP_UNSYNCHRONIZED_BIT 0x0020 #define GL_HALF_FLOAT 0x140B #define GL_COLOR 0x1800 #define GL_DEPTH 0x1801 diff --git a/panda/src/glstuff/glGraphicsBuffer_src.cxx b/panda/src/glstuff/glGraphicsBuffer_src.cxx index dde857eb17f..a97818448d5 100644 --- a/panda/src/glstuff/glGraphicsBuffer_src.cxx +++ b/panda/src/glstuff/glGraphicsBuffer_src.cxx @@ -281,7 +281,7 @@ begin_frame(FrameMode mode, Thread *current_thread) { CLP(GraphicsStateGuardian) *glgsg = (CLP(GraphicsStateGuardian) *)_gsg.p(); for (CLP(TextureContext) *gtc : _texture_contexts) { - if (gtc->needs_barrier(GL_FRAMEBUFFER_BARRIER_BIT)) { + if (gtc->needs_barrier(GL_FRAMEBUFFER_BARRIER_BIT, true)) { glgsg->issue_memory_barrier(GL_FRAMEBUFFER_BARRIER_BIT); // If we've done it for one, we've done it for all. break; @@ -1973,7 +1973,7 @@ resolve_multisamples() { // Issue memory barriers as necessary to make sure that the texture memory // is synchronized before we blit to it. for (CLP(TextureContext) *gtc : _texture_contexts) { - if (gtc->needs_barrier(GL_FRAMEBUFFER_BARRIER_BIT)) { + if (gtc->needs_barrier(GL_FRAMEBUFFER_BARRIER_BIT, true)) { glgsg->issue_memory_barrier(GL_FRAMEBUFFER_BARRIER_BIT); // If we've done it for one, we've done it for all. 
break; diff --git a/panda/src/glstuff/glGraphicsStateGuardian_src.cxx b/panda/src/glstuff/glGraphicsStateGuardian_src.cxx index 9988da71d07..b29387463b9 100644 --- a/panda/src/glstuff/glGraphicsStateGuardian_src.cxx +++ b/panda/src/glstuff/glGraphicsStateGuardian_src.cxx @@ -68,6 +68,7 @@ #include "shaderGenerator.h" #include "samplerState.h" #include "displayInformation.h" +#include "completionCounter.h" #if defined(HAVE_CG) && !defined(OPENGLES) #include @@ -97,6 +98,10 @@ PStatCollector CLP(GraphicsStateGuardian)::_check_residency_pcollector("*:PStats PStatCollector CLP(GraphicsStateGuardian)::_wait_fence_pcollector("Wait:Fence"); PStatCollector CLP(GraphicsStateGuardian)::_copy_texture_finish_pcollector("Draw:Copy texture:Finish"); +static PStatCollector _create_texture_storage_pcollector("Draw:Transfer data:Texture:Create Storage"); +static PStatCollector _create_map_pbo_pcollector("Draw:Transfer data:Texture:Create/Map PBO"); +static PStatCollector _load_texture_copy_pcollector("Draw:Transfer data:Texture:Copy/Convert"); + #if defined(HAVE_CG) && !defined(OPENGLES) AtomicAdjust::Integer CLP(GraphicsStateGuardian)::_num_gsgs_with_cg_contexts = 0; small_vector CLP(GraphicsStateGuardian)::_destroyed_cg_contexts; @@ -326,6 +331,23 @@ uchar_l_to_rgb(unsigned char *dest, const unsigned char *source, } } +/** + * Recopies the given array of pixels, converting from luminance to RGBA + * arrangement. + */ +static void +uchar_l_to_rgba(unsigned char *dest, const unsigned char *source, + int num_pixels) { + for (int i = 0; i < num_pixels; i++) { + dest[0] = source[0]; + dest[1] = source[0]; + dest[2] = source[0]; + dest[3] = 1; + dest += 4; + source += 1; + } +} + /** * Recopies the given array of pixels, converting from BGRA to RGBA * arrangement. @@ -426,78 +448,176 @@ ushort_la_to_rgba(unsigned short *dest, const unsigned short *source, } /** - * Reverses the order of the components within the image, to convert (for - * instance) GL_BGR to GL_RGB. Returns the byte pointer representing the - * converted image, or the original image if it is unchanged. - * - * new_image must be supplied; it is the PTA_uchar that will be used to hold - * the converted image if required. It will be modified only if the - * conversion is necessary, in which case the data will be stored there, and - * this pointer will be returned. If the conversion is not necessary, this - * pointer will be left unchanged. + * Determines the number of components of the given external format. 
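 *
 * For example (illustrative only):
 *
 *   get_external_format_components(GL_LUMINANCE_ALPHA)  // 2
 *   get_external_format_components(GL_BGRA)             // 4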
*/ -static const unsigned char * -fix_component_ordering(PTA_uchar &new_image, - const unsigned char *orig_image, size_t orig_image_size, - GLenum external_format, Texture *tex) { - const unsigned char *result = orig_image; +static int +get_external_format_components(GLint external_format) { + switch (external_format) { +#ifndef OPENGLES_1 + case GL_RED: + case GL_RED_INTEGER: + case GL_GREEN: + case GL_BLUE: +#endif + case GL_ALPHA: + case GL_LUMINANCE: +#ifndef OPENGLES + case GL_GREEN_INTEGER: + case GL_BLUE_INTEGER: + case GL_ALPHA_INTEGER: +#endif + return 1; + +#ifndef OPENGLES_1 + case GL_RG: + case GL_RG_INTEGER: +#endif + case GL_LUMINANCE_ALPHA: + return 2; + + case GL_RGB: +#ifndef OPENGLES_1 + case GL_RGB_INTEGER: +#endif +#ifndef OPENGLES + case GL_BGR: + case GL_BGR_INTEGER: +#endif + return 3; + case GL_RGBA: +#ifndef OPENGLES_1 + case GL_RGBA_INTEGER: +#endif + case GL_BGRA: +#ifndef OPENGLES + case GL_BGRA_INTEGER: +#endif + return 4; + + default: + GLCAT.error() + << "Unknown external format 0x" << std::hex << external_format + << std::dec << "\n"; + return 4; + } +} + +/** + * Copies the image with optional conversion. + */ +static void +copy_image(unsigned char *new_image, const unsigned char *orig_image, + size_t orig_image_size, GLint external_format, int num_components, + int component_width) { switch (external_format) { +#ifndef OPENGLES_1 + case GL_RED: + case GL_RED_INTEGER: + case GL_GREEN: + case GL_BLUE: +#endif + case GL_ALPHA: + case GL_LUMINANCE: +#ifndef OPENGLES + case GL_GREEN_INTEGER: + case GL_BLUE_INTEGER: + case GL_ALPHA_INTEGER: +#endif + if (num_components == 1) { + memcpy(new_image, orig_image, orig_image_size); + return; + } + break; + +#ifndef OPENGLES_1 + case GL_RG: + case GL_RG_INTEGER: +#endif + case GL_LUMINANCE_ALPHA: + if (num_components == 2) { + memcpy(new_image, orig_image, orig_image_size); + return; + } + break; + +#ifndef OPENGLES_1 +#ifndef OPENGLES + case GL_BGR: +#endif + case GL_RGB_INTEGER: + if (num_components == 3) { + memcpy(new_image, orig_image, orig_image_size); + return; + } + if (num_components == 1 && component_width == 1) { + uchar_l_to_rgb(new_image, orig_image, orig_image_size); + return; + } + break; +#endif + case GL_RGB: - if (tex->get_num_components() == 1) { - new_image = PTA_uchar::empty_array(orig_image_size * 3); +#ifndef OPENGLES + case GL_BGR_INTEGER: +#endif + // Need to swap order. 
+ if (num_components == 1 && component_width == 1) { uchar_l_to_rgb(new_image, orig_image, orig_image_size); - result = new_image; - break; + return; } - switch (tex->get_component_type()) { - case Texture::T_unsigned_byte: - case Texture::T_byte: - new_image = PTA_uchar::empty_array(orig_image_size); + if (num_components == 3 && component_width == 1) { uchar_bgr_to_rgb(new_image, orig_image, orig_image_size / 3); - result = new_image; - break; - - case Texture::T_unsigned_short: - case Texture::T_short: - new_image = PTA_uchar::empty_array(orig_image_size); - ushort_bgr_to_rgb((unsigned short *)new_image.p(), + return; + } + if (num_components == 3 && component_width == 2) { + ushort_bgr_to_rgb((unsigned short *)new_image, (const unsigned short *)orig_image, orig_image_size / 6); - result = new_image; - break; + return; + } + break; - default: - break; + case GL_BGRA: +#ifndef OPENGLES_1 + case GL_RGBA_INTEGER: +#endif + if (num_components == 4) { + memcpy(new_image, orig_image, orig_image_size); + return; + } + if (num_components == 1 && component_width == 1) { + uchar_l_to_rgba(new_image, orig_image, orig_image_size); + return; + } + if (num_components == 2 && component_width == 1) { + uchar_la_to_rgba(new_image, orig_image, orig_image_size / 2); + return; } break; case GL_RGBA: - if (tex->get_num_components() == 2) { - new_image = PTA_uchar::empty_array(orig_image_size * 2); +#ifndef OPENGLES + case GL_BGRA_INTEGER: +#endif + // Need to swap order. + if (num_components == 1 && component_width == 1) { + uchar_l_to_rgba(new_image, orig_image, orig_image_size); + return; + } + if (num_components == 2 && component_width == 1) { uchar_la_to_rgba(new_image, orig_image, orig_image_size / 2); - result = new_image; - break; + return; } - switch (tex->get_component_type()) { - case Texture::T_unsigned_byte: - case Texture::T_byte: - new_image = PTA_uchar::empty_array(orig_image_size); + if (num_components == 4 && component_width == 1) { uchar_bgra_to_rgba(new_image, orig_image, orig_image_size / 4); - result = new_image; - break; - - case Texture::T_unsigned_short: - case Texture::T_short: - new_image = PTA_uchar::empty_array(orig_image_size); - ushort_bgra_to_rgba((unsigned short *)new_image.p(), + return; + } + if (num_components == 4 && component_width == 2) { + ushort_bgra_to_rgba((unsigned short *)new_image, (const unsigned short *)orig_image, orig_image_size / 8); - result = new_image; - break; - - default: - break; + return; } break; @@ -505,7 +625,33 @@ fix_component_ordering(PTA_uchar &new_image, break; } - return result; + nassert_raise("Failed to convert image."); +} + +/** + * Reverses the order of the components within the image, to convert (for + * instance) GL_BGR to GL_RGB. Returns the byte pointer representing the + * converted image, or the original image if it is unchanged. + * + * new_image must be supplied; it is the PTA_uchar that will be used to hold + * the converted image if required. It will be modified only if the + * conversion is necessary, in which case the data will be stored there, and + * this pointer will be returned. If the conversion is not necessary, this + * pointer will be left unchanged. 
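 *
 * A small worked example (the names grey_ptr, grey_size and tex are
 * placeholders): a one-channel byte image destined for a GL_RGB upload comes
 * back three times as large, with the grey value replicated into R, G and B:
 *
 *   PTA_uchar converted;
 *   const unsigned char *ptr =
 *     fix_component_ordering(converted, grey_ptr, grey_size, GL_RGB, tex);
 *   // ptr == converted.p(), converted.size() == grey_size * 3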
+ */ +static const unsigned char * +fix_component_ordering(PTA_uchar &new_image, + const unsigned char *orig_image, size_t orig_image_size, + GLenum external_format, Texture *tex) { + if (external_format == GL_RGB || external_format == GL_RGBA) { + int num_components = tex->get_num_components(); + int component_width = tex->get_component_width(); + size_t new_image_size = (orig_image_size / num_components) * ((external_format == GL_RGBA) ? 4 : 3); + new_image = PTA_uchar::empty_array(new_image_size); + copy_image(&new_image[0], orig_image, orig_image_size, external_format, num_components, component_width); + return new_image; + } + return orig_image; } // #--- Zhao Nov2011 @@ -524,6 +670,7 @@ int CLP(GraphicsStateGuardian)::get_driver_shader_version_minor() { return _gl_s CLP(GraphicsStateGuardian):: CLP(GraphicsStateGuardian)(GraphicsEngine *engine, GraphicsPipe *pipe) : GraphicsStateGuardian(gl_coordinate_system, engine, pipe), + _job_queue_cvar(_job_queue_mutex), _renderbuffer_residency(get_prepared_objects()->get_name(), "renderbuffer"), _active_ppbuffer_memory_pcollector("Graphics memory:" + get_prepared_objects()->get_name() + ":Active:ppbuffer"), _inactive_ppbuffer_memory_pcollector("Graphics memory:" + get_prepared_objects()->get_name() + ":Inactive:ppbuffer") @@ -567,6 +714,13 @@ CLP(GraphicsStateGuardian)(GraphicsEngine *engine, GraphicsPipe *pipe) : _cg_context = 0; #endif +#ifdef HAVE_THREADS + AsyncTaskManager *task_mgr = AsyncTaskManager::get_global_ptr(); + _async_chain = task_mgr->make_task_chain("gl_texture_transfer", + gl_texture_transfer_num_threads, + gl_texture_transfer_thread_priority); +#endif + #ifdef DO_PSTATS if (gl_finish) { GLCAT.warning() @@ -585,6 +739,15 @@ CLP(GraphicsStateGuardian):: << "GLGraphicsStateGuardian " << this << " destructing\n"; } +#ifdef HAVE_THREADS + // Make sure there are no more async tasks that could reference this GSG. 
+ _async_chain->wait_for_tasks(); +#endif + { + MutexHolder holder(_job_queue_mutex); + _job_queue.clear(); + } + close_gsg(); } @@ -1731,6 +1894,24 @@ reset() { } #endif +#ifndef OPENGLES_1 + _glCopyBufferSubData = nullptr; + if (_supports_buffers) { + if (is_at_least_gl_version(3, 1) || + is_at_least_gles_version(3, 0) || + has_extension("GL_ARB_copy_buffer")) { + _glCopyBufferSubData = (PFNGLCOPYBUFFERSUBDATAPROC) + get_extension_func("glCopyBufferSubData"); + } +#ifdef OPENGLES_2 + else if (has_extension("GL_NV_copy_buffer")) { + _glCopyBufferSubData = (PFNGLCOPYBUFFERSUBDATAPROC) + get_extension_func("glCopyBufferSubDataNV"); + } +#endif + } +#endif + #ifdef OPENGLES if (is_at_least_gles_version(3, 0)) { _glMapBufferRange = (PFNGLMAPBUFFERRANGEEXTPROC) @@ -1757,7 +1938,8 @@ reset() { _glMapBufferRange = nullptr; } - if (is_at_least_gl_version(4, 4) || has_extension("GL_ARB_buffer_storage")) { + if (_glMapBufferRange != nullptr && + (is_at_least_gl_version(4, 4) || has_extension("GL_ARB_buffer_storage"))) { _glBufferStorage = (PFNGLBUFFERSTORAGEPROC) get_extension_func("glBufferStorage"); @@ -2593,7 +2775,10 @@ reset() { #endif #ifndef OPENGLES - if (is_at_least_gl_version(4, 5) || has_extension("GL_ARB_direct_state_access")) { + _glMapNamedBufferRange = nullptr; + + if (gl_support_dsa && + (is_at_least_gl_version(4, 5) || has_extension("GL_ARB_direct_state_access"))) { _glCreateTextures = (PFNGLCREATETEXTURESPROC) get_extension_func("glCreateTextures"); _glTextureStorage2D = (PFNGLTEXTURESTORAGE2DPROC) @@ -2607,12 +2792,29 @@ reset() { _glBindTextureUnit = (PFNGLBINDTEXTUREUNITPROC) get_extension_func("glBindTextureUnit"); + if (_glMapBufferRange != nullptr) { + _glMapNamedBufferRange = (PFNGLMAPNAMEDBUFFERRANGEPROC) + get_extension_func("glMapNamedBufferRange"); + } + _supports_dsa = true; } else { _supports_dsa = false; } #endif +#ifndef OPENGLES_1 +#ifdef OPENGLES + if (is_at_least_gles_version(3, 0) || has_extension("GL_NV_pixel_buffer_object")) { +#else + if (is_at_least_gl_version(2, 1) || has_extension("GL_ARB_pixel_buffer_object")) { +#endif + _supports_pixel_buffers = true; + } else { + _supports_pixel_buffers = false; + } +#endif + #ifndef OPENGLES_1 // Do we support empty framebuffer objects? #ifdef OPENGLES @@ -3207,7 +3409,7 @@ reset() { _max_image_units = 0; #ifndef OPENGLES_1 #ifdef OPENGLES - if (is_at_least_gles_version(3, 1) && gl_immutable_texture_storage) { + if (is_at_least_gles_version(3, 1)) { #else if (is_at_least_gl_version(4, 2) || has_extension("GL_ARB_shader_image_load_store")) { #endif @@ -4279,6 +4481,8 @@ prepare_lens() { */ bool CLP(GraphicsStateGuardian):: begin_frame(Thread *current_thread) { + process_pending_jobs(false); + if (!GraphicsStateGuardian::begin_frame(current_thread)) { return false; } @@ -4292,9 +4496,11 @@ begin_frame(Thread *current_thread) { _primitive_batches_display_list_pcollector.clear_level(); #endif - if (!_async_ram_copies.empty()) { - finish_async_framebuffer_ram_copies(); +#ifndef OPENGLES_1 + if (!_fences.empty()) { + process_fences(false); } +#endif #if defined(DO_PSTATS) && !defined(OPENGLES) int frame_number = ClockObject::get_global_clock()->get_frame_count(current_thread); @@ -6252,25 +6458,24 @@ issue_memory_barrier(GLbitfield barriers) { _glMemoryBarrier(barriers); - // Indicate that barriers no longer need to be issued for the relevant lists - // of textures. + // Increment these counters to indicate that these barriers have been issued. 
if (barriers & GL_TEXTURE_FETCH_BARRIER_BIT) { - _textures_needing_fetch_barrier.clear(); + ++_texture_fetch_barrier_counter; GLCAT.spam(false) << " texture_fetch"; } if (barriers & GL_SHADER_IMAGE_ACCESS_BARRIER_BIT) { - _textures_needing_image_access_barrier.clear(); + ++_shader_image_access_barrier_counter; GLCAT.spam(false) << " shader_image_access"; } if (barriers & GL_TEXTURE_UPDATE_BARRIER_BIT) { - _textures_needing_update_barrier.clear(); + ++_texture_update_barrier_counter; GLCAT.spam(false) << " texture_update"; } if (barriers & GL_FRAMEBUFFER_BARRIER_BIT) { - _textures_needing_framebuffer_barrier.clear(); + ++_framebuffer_barrier_counter; GLCAT.spam(false) << " framebuffer"; } @@ -6370,23 +6575,16 @@ prepare_texture(Texture *tex) { * (and if get_incomplete_render() is true). */ bool CLP(GraphicsStateGuardian):: -update_texture(TextureContext *tc, bool force) { +update_texture(TextureContext *tc, bool force, CompletionToken token) { CLP(TextureContext) *gtc; DCAST_INTO_R(gtc, tc, false); - Texture *tex = tc->get_texture(); - GLenum target = get_texture_target(tex->get_texture_type()); - if (gtc->_target != target) { - // The target has changed. That means we have to re-bind a new texture - // object. - gtc->reset_data(target, tex->get_num_views()); - } - if (gtc->was_image_modified() || !gtc->_has_storage) { PStatGPUTimer timer(this, _texture_update_pcollector); // If the texture image was modified, reload the texture. - bool okflag = upload_texture(gtc, force, tex->uses_mipmaps()); + Texture *tex = tc->get_texture(); + bool okflag = upload_texture(gtc, force, tex->uses_mipmaps(), std::move(token)); if (!okflag) { GLCAT.error() << "Could not load " << *tex << "\n"; @@ -6402,6 +6600,7 @@ update_texture(TextureContext *tc, bool force) { } else if (gtc->was_properties_modified()) { PStatGPUTimer timer(this, _texture_update_pcollector); + Texture *tex = tc->get_texture(); // If only the properties have been modified, we don't necessarily need to // reload the texture. @@ -6417,7 +6616,7 @@ update_texture(TextureContext *tc, bool force) { if (needs_reload) { gtc->mark_needs_reload(); - bool okflag = upload_texture(gtc, force, tex->uses_mipmaps()); + bool okflag = upload_texture(gtc, force, tex->uses_mipmaps(), std::move(token)); if (!okflag) { GLCAT.error() << "Could not load " << *tex << "\n"; @@ -6427,7 +6626,20 @@ update_texture(TextureContext *tc, bool force) { else { // The texture didn't need reloading, but mark it fully updated now. gtc->mark_loaded(); + + if (force) { + // This update is still underway. + gtc->wait_pending_uploads(); + } + token.complete(true); + } + } + else { + if (force) { + // This update is still underway. 
+ gtc->wait_pending_uploads(); } + token.complete(true); } gtc->enqueue_lru(&_prepared_objects->_graphics_memory_lru); @@ -6445,12 +6657,9 @@ void CLP(GraphicsStateGuardian):: release_texture(TextureContext *tc) { CLP(TextureContext) *gtc = DCAST(CLP(TextureContext), tc); -#ifndef OPENGLES_1 - _textures_needing_fetch_barrier.erase(gtc); - _textures_needing_image_access_barrier.erase(gtc); - _textures_needing_update_barrier.erase(gtc); - _textures_needing_framebuffer_barrier.erase(gtc); -#endif + gtc->cancel_pending_uploads(); + gtc->wait_pending_uploads(); + gtc->delete_unused_pbos(); gtc->set_num_views(0); delete gtc; @@ -6473,13 +6682,6 @@ release_textures(const pvector &contexts) { for (TextureContext *tc : contexts) { CLP(TextureContext) *gtc = DCAST(CLP(TextureContext), tc); -#ifndef OPENGLES_1 - _textures_needing_fetch_barrier.erase(gtc); - _textures_needing_image_access_barrier.erase(gtc); - _textures_needing_update_barrier.erase(gtc); - _textures_needing_framebuffer_barrier.erase(gtc); -#endif - num_indices += gtc->_num_views; if (gtc->_buffers != nullptr) { num_buffers += gtc->_num_views; @@ -7516,7 +7718,6 @@ release_shader_buffers(const pvector &contexts) { */ bool CLP(GraphicsStateGuardian):: extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data) { - GLuint index = 0; BufferContext *bc = buffer->prepare_now(get_prepared_objects(), this); if (bc == nullptr || !bc->is_of_type(CLP(BufferContext)::get_class_type())) { return false; @@ -7525,6 +7726,10 @@ extract_shader_buffer_data(ShaderBuffer *buffer, vector_uchar &data) { data.resize(buffer->get_data_size_bytes()); + if (_glMemoryBarrier != nullptr) { + _glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); + } + _glBindBuffer(GL_SHADER_STORAGE_BUFFER, gbc->_index); _glGetBufferSubData(GL_SHADER_STORAGE_BUFFER, 0, data.size(), &data[0]); @@ -7805,6 +8010,8 @@ framebuffer_copy_to_texture(Texture *tex, int view, int z, nassertr(tc != nullptr, false); CLP(TextureContext) *gtc = DCAST(CLP(TextureContext), tc); + gtc->cancel_pending_uploads(); + GLenum target = get_texture_target(tex->get_texture_type()); if (gtc->_target != target) { gtc->reset_data(target, view + 1); @@ -7884,8 +8091,8 @@ framebuffer_copy_to_texture(Texture *tex, int view, int z, } #ifndef OPENGLES_1 - if (gtc->needs_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT)) { - // Make sure that any incoherent writes to this texture have been synced. + if (gtc->needs_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT, true)) { + // Make sure that any reads and writes to this texture have been synced. issue_memory_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT); } #endif @@ -8275,9 +8482,46 @@ framebuffer_copy_to_ram(Texture *tex, int view, int z, _glMemoryBarrier(GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT); } #endif - GLsync fence = _glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); - _async_ram_copies.push_back({request, pbo, fence, external_format, - view, mapped_ptr, image_size}); + + insert_fence([ + this, request = PT(ScreenshotRequest)(request), + mapped_ptr, size = image_size, + pbo, view, external_format + ] (bool success) { + + void *ptr = mapped_ptr; + if (ptr == nullptr) { + ptr = map_read_buffer(GL_PIXEL_PACK_BUFFER, pbo, size); + } + + // Do the memcpy in the background, since it can be slow. + auto func = [=](AsyncTask *task) { + const unsigned char *result = (unsigned char *)ptr; + PTA_uchar new_image; + if (external_format == GL_RGBA || external_format == GL_RGB) { + // We may have to reverse the byte ordering of the image if GL didn't do + // it for us. 
+ result = fix_component_ordering(new_image, result, size, + external_format, request->get_result()); + } + request->set_view_data(view, result); + + // Finishing can take a long time, release the client buffer first so it + // can be reused for the next screenshot. + this->release_client_buffer(pbo, ptr, size); + request->finish(); + return AsyncTask::DS_done; + }; +#ifdef HAVE_THREADS + // We assign a sort value based on the originating frame number, so that + // earlier frames will be processed before subsequent frames, but we don't + // make it unique for every frame, which would kill concurrency. + int frame_number = request->get_frame_number(); + _async_chain->add(std::move(func), "screenshot", frame_number >> 3, -(frame_number & ((1 << 3) - 1))); +#else + func(nullptr); +#endif + }); } else #endif if (external_format == GL_RGBA || external_format == GL_RGB) { @@ -8302,104 +8546,6 @@ framebuffer_copy_to_ram(Texture *tex, int view, int z, return true; } -/** - * Finishes all asynchronous framebuffer-copy-to-ram operations. - */ -void CLP(GraphicsStateGuardian):: -finish_async_framebuffer_ram_copies(bool force) { -#ifndef OPENGLES_1 - if (_async_ram_copies.empty()) { - return; - } - - //XXX having a fixed number of threads is not a great idea. We ought to have - // a common thread pool that is sized based on the available number of CPUs. -#ifdef HAVE_THREADS - AsyncTaskManager *task_mgr = AsyncTaskManager::get_global_ptr(); - static AsyncTaskChain *chain = task_mgr->make_task_chain("texture_download", 2, TP_low); -#endif - - PStatTimer timer(_copy_texture_finish_pcollector); - - if (force) { - // Just wait for the last fence, the rest must be complete too then. - PStatTimer timer(_wait_fence_pcollector); - GLsync fence = _async_ram_copies.back()._fence; - _glClientWaitSync(fence, 0, (GLuint64)-1); - } - - while (!_async_ram_copies.empty()) { - AsyncRamCopy © = _async_ram_copies.front(); - if (!force) { - GLenum result = _glClientWaitSync(copy._fence, 0, 0); - if (result != GL_ALREADY_SIGNALED && result != GL_CONDITION_SATISFIED) { - // Not yet done. The rest must not yet be done then, either. - break; - } - } - _glDeleteSync(copy._fence); - - GLuint pbo = copy._pbo; - int view = copy._view; - PT(ScreenshotRequest) request = std::move(copy._request); - GLuint external_format = copy._external_format; - void *mapped_ptr = copy._mapped_pointer; - size_t size = copy._size; - - if (mapped_ptr == nullptr) { - _glBindBuffer(GL_PIXEL_PACK_BUFFER, pbo); -#ifdef OPENGLES - // There is neither glMapBuffer nor persistent mapping in OpenGL ES - mapped_ptr = _glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, size, GL_MAP_READ_BIT); -#else - // If we get here in desktop GL, we must not have persistent mapping - nassertv(!_supports_buffer_storage); - mapped_ptr = _glMapBuffer(GL_PIXEL_PACK_BUFFER, GL_READ_ONLY); -#endif - } - - // Do the memcpy in the background, since it can be slow. - auto func = [=](AsyncTask *task) { - const unsigned char *result = (unsigned char *)mapped_ptr; - PTA_uchar new_image; - if (external_format == GL_RGBA || external_format == GL_RGB) { - // We may have to reverse the byte ordering of the image if GL didn't do - // it for us. - result = fix_component_ordering(new_image, result, size, - external_format, request->get_result()); - } - request->set_view_data(view, result); - - // Finishing can take a long time, release the client buffer first so it - // can be reused for the next screenshot. 
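// Worked example of the sort/priority scheme used for these screenshot
// tasks: frame 21 gets chain sort 21 >> 3 == 2 and priority -(21 & 7) == -5,
// so frames 16..23 share one sort bucket (and may run concurrently) while
// earlier frames within the bucket get the higher priority.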
- this->release_client_buffer(pbo, mapped_ptr, size); - request->finish(); - return AsyncTask::DS_done; - }; -#ifdef HAVE_THREADS - // We assign a sort value based on the originating frame number, so that - // earlier frames will be processed before subsequent frames, but we don't - // make it unique for every frame, which would kill concurrency. - int frame_number = request->get_frame_number(); - chain->add(std::move(func), "screenshot", frame_number >> 3, -(frame_number & ((1 << 3) - 1))); -#else - func(nullptr); -#endif - - _async_ram_copies.pop_front(); - - // If there is 1 remaining, save it for next frame. This helps prevent an - // inconsistent frame rate when the number of fetched frames alternates - // between 0 and 2, which can settle into a stable feedback loop. - if (!force && _async_ram_copies.size() == 1) { - break; - } - } - - _glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); -#endif -} - #ifdef SUPPORT_FIXED_FUNCTION /** * @@ -13870,12 +14016,15 @@ apply_sampler(GLuint unit, const SamplerState &sampler, CLP(TextureContext) *gtc * image. */ bool CLP(GraphicsStateGuardian):: -upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) { +upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps, + CompletionToken token) { PStatGPUTimer timer(this, _load_texture_pcollector); Texture *tex = gtc->get_texture(); - if (_effective_incomplete_render && !force) { + //FIXME: upload simple texture for async uploaded thing + bool async_upload = true; + if (_effective_incomplete_render && !force && !async_upload) { bool has_image = _supports_compressed_texture ? tex->has_ram_image() : tex->has_uncompressed_ram_image(); if (!has_image && tex->might_have_ram_image() && tex->has_simple_ram_image() && @@ -14159,6 +14308,8 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) { int num_views = tex->get_num_views(); if (needs_reload) { + gtc->cancel_pending_uploads(); + if (gtc->_immutable) { GLCAT.info() << "Attempt to modify texture with immutable storage, recreating texture.\n"; @@ -14172,8 +14323,8 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) { #ifndef OPENGLES_1 if (needs_reload || !image.is_null()) { - // Make sure that any incoherent writes to this texture have been synced. - if (gtc->needs_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT)) { + // Make sure that any reads and writes to this texture have been synced. + if (gtc->needs_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT, true)) { issue_memory_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT); } } @@ -14190,46 +14341,73 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) { nassertr(gtc->_buffers != nullptr, false); } - bool extract_success = false; - if (tex->get_post_load_store_cache()) { - extract_success = true; + bool compressed = image_compression != Texture::CM_off; + if (compressed && !_supports_compressed_texture) { + return false; } - bool success = true; - for (int view = 0; view < num_views; ++view) { - if (upload_texture_image(gtc, view, needs_reload || view >= old_num_views, - mipmap_bias, num_levels, - internal_format, external_format, - component_type, image_compression)) { - gtc->_has_storage = true; + // Does this texture require asynchronous uploads? +#if defined(HAVE_THREADS) && !defined(OPENGLES_1) + int async_buffers = _supports_pixel_buffers ? tex->get_num_async_transfer_buffers() : 0; + if (async_buffers != 0) { + // Prefer immutable storage, if supported. 
+ if (needs_reload && _supports_tex_storage) { + gtc->_immutable = true; + } + } + else if (async_buffers == 0 && gtc->_num_pbos > 0) { + gtc->delete_unused_pbos(); + } +#else + int async_buffers = 0; +#endif + + // Keep track of which views are uploaded. + CompletionCounter counter; + bool success = true; + + for (int view = 0; view < num_views; ++view) { + if (upload_texture_view(gtc, view, needs_reload || view >= old_num_views, + mipmap_bias, num_levels, + internal_format, external_format, + component_type, compressed, async_buffers, + counter.make_token())) { + // We always create storage right away even if we do the upload of the + // actual data asynchronously. + gtc->_has_storage = true; gtc->_internal_format = internal_format; gtc->_width = width; gtc->_height = height; gtc->_depth = depth; gtc->_num_levels = num_levels; - - if (extract_success) { - // The next call assumes the texture is still bound. - if (!do_extract_texture_data(gtc, view)) { - extract_success = false; - } - } } else { success = false; } } - report_my_gl_errors(); + if (!success) { + report_my_gl_errors(); + return false; + } + + gtc->_uploads_pending++; + + std::move(counter).then([=, token = std::move(token)] (bool success) mutable { + --gtc->_uploads_pending; + if (!success) { + token.complete(false); + return; + } - if (success) { if (needs_reload) { gtc->update_data_size_bytes(get_texture_memory_size(gtc)); } - nassertr(gtc->_has_storage, false); + nassertv(gtc->_has_storage); - if (extract_success) { + Texture *tex = gtc->get_texture(); + if (tex->get_post_load_store_cache()) { tex->set_post_load_store_cache(false); // OK, get the RAM image, and save it in a BamCache record. if (tex->has_ram_image()) { @@ -14243,14 +14421,20 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) { } GraphicsEngine *engine = get_engine(); - nassertr(engine != nullptr, false); + nassertv(engine != nullptr); engine->texture_uploaded(tex); - gtc->mark_loaded(); - return true; - } + token.complete(true); + }); - return false; + // Update the modified counters now, even if we've only spawned an async + // upload, because we've already set things in motion to update the texture + // to this version. Otherwise, future calls to update_texture will continue + // to try to update the image over and over again. + gtc->mark_loaded(); + + report_my_gl_errors(); + return true; } /** @@ -14258,17 +14442,13 @@ upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps) { * texture memory. */ bool CLP(GraphicsStateGuardian):: -upload_texture_image(CLP(TextureContext) *gtc, int view, bool needs_reload, - int mipmap_bias, int num_levels, GLint internal_format, - GLint external_format, GLenum component_type, - Texture::CompressionMode image_compression) { +upload_texture_view(CLP(TextureContext) *gtc, int view, bool needs_reload, + int mipmap_bias, int num_levels, GLint internal_format, + GLint external_format, GLenum component_type, + bool compressed, int async_buffers, CompletionToken token) { // Make sure the error stack is cleared out before we begin. clear_my_gl_errors(); - if (image_compression != Texture::CM_off && !_supports_compressed_texture) { - return false; - } - GLenum target = gtc->_target; if (target == GL_NONE) { // Unsupported target (e.g. 3-d texturing on GL 1.1). 
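// The per-view uploads above are tied together with a CompletionCounter,
// using only what this patch itself relies on: make_token() hands out one
// token per pending job and the move-qualified then() fires once every token
// has completed.  A sketch (start_view_upload is a hypothetical async job):
//
//   CompletionCounter counter;
//   start_view_upload(counter.make_token());
//   start_view_upload(counter.make_token());
//   std::move(counter).then([] (bool success) {
//     // presumably false if any of the jobs reported failure
//   });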
@@ -14305,9 +14485,14 @@ upload_texture_image(CLP(TextureContext) *gtc, int view, bool needs_reload, int width = tex->get_expected_mipmap_x_size(mipmap_bias); int height = tex->get_expected_mipmap_y_size(mipmap_bias); int depth = tex->get_expected_mipmap_z_size(mipmap_bias); + int num_components = tex->get_num_components(); + int expected_num_components = compressed ? num_components : get_external_format_components(external_format); + int component_width = tex->get_component_width(); + GLenum usage = GL_STATIC_DRAW; #ifndef OPENGLES_1 if (target == GL_TEXTURE_BUFFER) { + usage = get_usage(tex->get_usage_hint()); GLuint buffer = gtc->get_view_buffer(view); nassertr(buffer != 0, false); _glBindBuffer(GL_TEXTURE_BUFFER, buffer); @@ -14375,6 +14560,7 @@ upload_texture_image(CLP(TextureContext) *gtc, int view, bool needs_reload, // but we are not allowed to change the texture size or number of mipmap // levels after this point. if (gtc->_immutable) { + PStatTimer timer(_create_texture_storage_pcollector); if (GLCAT.is_debug()) { GLCAT.debug() << "allocating storage for texture " << tex->get_name() << ", " @@ -14409,15 +14595,84 @@ upload_texture_image(CLP(TextureContext) *gtc, int view, bool needs_reload, // How many mipmap levels do we have available to upload? int num_ram_mipmap_levels = 0; + GLuint pbo = 0; + size_t pbo_size = 0; + void *mapped_ptr = nullptr; if (!image.is_null()) { num_ram_mipmap_levels = std::min(num_levels, tex->get_num_ram_mipmap_images() - mipmap_bias); + + // Create a PBO that can hold all the mipmap levels. +#if defined(HAVE_THREADS) && !defined(OPENGLES_1) + if (async_buffers != 0) { + pbo_size = 0; + for (int n = mipmap_bias; n < num_ram_mipmap_levels + mipmap_bias; ++n) { + size_t view_size = tex->get_ram_mipmap_view_size(n); + if (!compressed) { + view_size = expected_num_components * (view_size / num_components); + } + pbo_size += view_size; + } + + bool create_storage = false; + if (pbo_size != gtc->_pbo_size) { + // No PBOs yet, or they aren't of the right size. + if (_supports_buffer_storage) { + // If using buffer storage, need to deallocate all of them. + gtc->delete_unused_pbos(); + } + gtc->_pbo_size = pbo_size; + create_storage = true; + } + else if (async_buffers > 0) { + // Wait for a PBO to become available if we're at our limit. + gtc->wait_for_unused_pbo(async_buffers); + } + + PStatTimer timer(_create_map_pbo_pcollector); + if (gtc->_unused_pbos.empty()) { + _glGenBuffers(1, &pbo); + gtc->_num_pbos++; + create_storage = true; + } else { + // Map an existing PBO. + pbo = gtc->_unused_pbos.back(); + gtc->_unused_pbos.pop_back(); + } + + mapped_ptr = map_write_discard_buffer(GL_PIXEL_UNPACK_BUFFER, pbo, + pbo_size, create_storage); + if (mapped_ptr == nullptr) { + report_my_gl_errors(); + GLCAT.warning() + << "Failed to map pixel unpack buffer.\n"; + gtc->_unused_pbos.push_back(pbo); + } + } +#endif } - if (!needs_reload) { - // Try to subload the image over the existing GL Texture object, possibly - // saving on texture memory fragmentation. + if (needs_reload && num_ram_mipmap_levels == 0 && + external_format == GL_DEPTH_STENCIL && get_supports_depth_stencil()) { +#ifdef OPENGLES + component_type = GL_UNSIGNED_INT_24_8_OES; +#else + component_type = GL_UNSIGNED_INT_24_8_EXT; +#endif + } - if (GLCAT.is_debug()) { + int upload_count = ++gtc->_uploads_started; + + if (GLCAT.is_debug()) { + if (needs_reload) { + // Load the image up from scratch, creating a new GL Texture object. 
+ GLCAT.debug() + << "loading new texture object for " << tex->get_name() << " view " + << view << ", " << width << " x " << height << " x " << depth + << ", mipmaps " << num_ram_mipmap_levels << " / " << num_levels; + } + else { + // Try to subload the image over the existing GL Texture object, possibly + // saving on texture memory fragmentation. SparseArray pages = gtc->get_view_modified_pages(view, 0); if (num_ram_mipmap_levels == 0) { if (tex->has_clear_color()) { @@ -14425,457 +14680,622 @@ upload_texture_image(CLP(TextureContext) *gtc, int view, bool needs_reload, << "clearing texture " << tex->get_name() << " view " << view << ", " << width << " x " << height << " x " << depth << ", pages " << pages << ", mipmaps " << num_levels << ", clear_color = " - << tex->get_clear_color() << "\n"; + << tex->get_clear_color(); } else { GLCAT.debug() << "not loading NULL image for texture " << tex->get_name() << " view " << view << ", " << width << " x " << height << " x " << depth - << ", pages " << pages << ", mipmaps = " << num_levels << "\n"; + << ", pages " << pages << ", mipmaps = " << num_levels; } } else { GLCAT.debug() << "updating image data of texture " << tex->get_name() << " view " << view << ", " << width << " x " << height << " x " << depth << ", pages " << pages << ", mipmaps " << num_ram_mipmap_levels - << " / " << num_levels << "\n"; - } - } - - for (int n = mipmap_bias; n < num_levels + mipmap_bias; ++n) { - SparseArray pages = gtc->get_view_modified_pages(view, n); - - // we grab the mipmap pointer first, if it is NULL we grab the normal - // mipmap image pointer which is a PTA_uchar - const unsigned char *image_ptr = (unsigned char*)tex->get_ram_mipmap_pointer(n); - CPTA_uchar ptimage; - if (image_ptr == nullptr) { - ptimage = tex->get_ram_mipmap_image(n); - if (ptimage.is_null()) { - if (n - mipmap_bias < num_ram_mipmap_levels) { - // We were told we'd have this many RAM mipmap images, but we - // don't. Raise a warning. - GLCAT.warning() - << "No mipmap level " << n << " defined for " << tex->get_name() - << "\n"; - break; - } - - if (tex->has_clear_color()) { - // The texture has a clear color, so we should fill this mipmap - // level to a solid color. -#ifndef OPENGLES - if (target != GL_TEXTURE_BUFFER) { - if (_supports_clear_texture) { - // We can do that with the convenient glClearTexImage - // function. - vector_uchar clear_data = tex->get_clear_data(); + << " / " << num_levels; + } + } + if (mapped_ptr != nullptr) { + GLCAT.debug(false) << " (async #" << upload_count << " via buffer " << pbo << ")\n"; + } else { + GLCAT.debug(false) << " (#" << upload_count << ")\n"; + } + } - if (pages.has_all_of(0, depth)) { - _glClearTexImage(index, n - mipmap_bias, external_format, - component_type, (void *)&clear_data[0]); - } - else for (size_t sri = 0; sri < pages.get_num_subranges(); ++sri) { - int begin = pages.get_subrange_begin(sri); - int num_pages = pages.get_subrange_end(sri) - begin; - _glClearTexSubImage(index, n - mipmap_bias, 0, 0, begin, - width, height, num_pages, external_format, - component_type, (void *)&clear_data[0]); - } - continue; - } - } else { - if (_supports_clear_buffer) { - // For buffer textures we need to clear the underlying - // storage. - vector_uchar clear_data = tex->get_clear_data(); - - _glClearBufferData(GL_TEXTURE_BUFFER, internal_format, external_format, - component_type, (const void *)&clear_data[0]); - continue; - } - } -#endif // OPENGLES - // Ask the Texture class to create the mipmap level in RAM. 
It'll - // fill it in with the correct clear color, which we can then - // upload. - ptimage = tex->make_ram_mipmap_image(n); + // Keeps track of any async jobs we've spawned. +#if defined(HAVE_THREADS) && !defined(OPENGLES_1) + CompletionCounter counter; + struct AsyncLevel { + int width, height, depth; + size_t page_size; + uintptr_t pbo_offset; + SparseArray pages; + }; + AsyncLevel *async_levels = nullptr; + size_t num_async_levels = 0; + uintptr_t pbo_offset = 0u; +#endif + bool success = true; - } else { - // No clear color and no more images. - break; - } - } - image_ptr = ptimage; - } + for (int n = mipmap_bias; n < num_levels + mipmap_bias; ++n) { + SparseArray pages = gtc->get_view_modified_pages(view, n); + int level = n - mipmap_bias; + + int width = tex->get_expected_mipmap_x_size(n); + int height = tex->get_expected_mipmap_y_size(n); + + // we grab the mipmap pointer first, if it is NULL we grab the normal + // mipmap image pointer which is a PTA_uchar + const unsigned char *image_ptr = (unsigned char*)tex->get_ram_mipmap_pointer(n); + CPTA_uchar ptimage; + if (image_ptr == nullptr) { + ptimage = tex->get_ram_mipmap_image(n); + image_ptr = ptimage; + } + if (image_ptr == nullptr) { + if (level < num_ram_mipmap_levels) { + // We were told we'd have this many RAM mipmap images, but we + // don't. Raise a warning. + GLCAT.warning() + << "No mipmap level " << n << " defined for " << tex->get_name() + << "\n"; - PTA_uchar bgr_image; - size_t page_size = tex->get_ram_mipmap_page_size(n); - if (image_ptr != nullptr) { - const unsigned char *orig_image_ptr = image_ptr; - size_t view_size = tex->get_ram_mipmap_view_size(n); - image_ptr += view_size * view; - nassertr(image_ptr >= orig_image_ptr && image_ptr + view_size <= orig_image_ptr + tex->get_ram_mipmap_image_size(n), false); - - if (image_compression == Texture::CM_off) { - // If the GL doesn't claim to support BGR, we may have to reverse - // the component ordering of the image. - image_ptr = fix_component_ordering(bgr_image, image_ptr, view_size, - external_format, tex); + if (needs_reload && _supports_texture_max_level) { + // Tell the GL we have no more mipmaps for it to use. + glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, level - 1); } + break; } - int width = tex->get_expected_mipmap_x_size(n); - int height = tex->get_expected_mipmap_y_size(n); - -#ifdef DO_PSTATS - _data_transferred_pcollector.add_level(page_size * pages.get_num_on_bits()); -#endif - switch (target) { -#ifndef OPENGLES_1 - case GL_TEXTURE_3D: - if (_supports_3d_texture) { - for (size_t sri = 0; sri < pages.get_num_subranges(); ++sri) { - int begin = pages.get_subrange_begin(sri); - int num_pages = pages.get_subrange_end(sri) - begin; - const unsigned char *page_ptr = image_ptr + page_size * begin; - - if (image_compression == Texture::CM_off) { - _glTexSubImage3D(target, n - mipmap_bias, - 0, 0, begin, width, height, num_pages, - external_format, component_type, page_ptr); - } else { - _glCompressedTexSubImage3D(target, n - mipmap_bias, - 0, 0, begin, width, height, num_pages, - external_format, - page_size * num_pages, page_ptr); + if (tex->has_clear_color()) { + // The texture has a clear color, so we should fill this mipmap + // level to a solid color. +#ifndef OPENGLES + if (target != GL_TEXTURE_BUFFER) { + if (_supports_clear_texture && !needs_reload) { + // We can do that with the convenient glClearTexImage + // function. 
+ vector_uchar clear_data = tex->get_clear_data(); + + if (pages.has_all_of(0, depth)) { + _glClearTexImage(index, level, external_format, + component_type, (void *)&clear_data[0]); } + else for (size_t sri = 0; sri < pages.get_num_subranges(); ++sri) { + int begin = pages.get_subrange_begin(sri); + int num_pages = pages.get_subrange_end(sri) - begin; + _glClearTexSubImage(index, level, 0, 0, begin, + width, height, num_pages, external_format, + component_type, (void *)&clear_data[0]); + } + continue; } } else { - report_my_gl_errors(); - return false; + if (_supports_clear_buffer && !needs_reload) { + // For buffer textures we need to clear the underlying + // storage. + vector_uchar clear_data = tex->get_clear_data(); + + _glClearBufferData(GL_TEXTURE_BUFFER, internal_format, external_format, + component_type, (const void *)&clear_data[0]); + continue; + } } +#endif // OPENGLES + // Ask the Texture class to create the mipmap level in RAM. It'll + // fill it in with the correct clear color, which we can then + // upload. + ptimage = tex->make_ram_mipmap_image(n); + image_ptr = ptimage; + } + else if (!needs_reload) { + // No clear color and no more images, and no storage to create. break; -#endif // OPENGLES_1 - -#ifndef OPENGLES - case GL_TEXTURE_1D: - if (image_compression == Texture::CM_off) { - glTexSubImage1D(target, n - mipmap_bias, 0, width, - external_format, component_type, image_ptr); - } else { - _glCompressedTexSubImage1D(target, n - mipmap_bias, 0, width, - external_format, page_size, image_ptr); + } + else if (compressed) { + // We can't upload a NULL compressed texture. + if (_supports_texture_max_level) { + // Tell the GL we have no more mipmaps for it to use. + glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, level - 1); } break; -#endif // OPENGLES + } + } -#ifndef OPENGLES_1 - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP_ARRAY: - if (_supports_2d_texture_array) { - for (size_t sri = 0; sri < pages.get_num_subranges(); ++sri) { - int begin = pages.get_subrange_begin(sri); - int num_pages = pages.get_subrange_end(sri) - begin; - const unsigned char *page_ptr = image_ptr + page_size * begin; - - if (image_compression == Texture::CM_off) { - _glTexSubImage3D(target, n - mipmap_bias, - 0, 0, begin, width, height, num_pages, - external_format, component_type, page_ptr); - } else { - _glCompressedTexSubImage3D(target, n - mipmap_bias, - 0, 0, begin, width, height, num_pages, - external_format, - page_size * num_pages, page_ptr); - } - } - } else { - report_my_gl_errors(); - return false; + // Select the correct view. + size_t orig_view_size = 0; + size_t orig_page_size = 0; + size_t page_size = 0; + if (image_ptr != nullptr) { + orig_view_size = tex->get_ram_mipmap_view_size(n); + if (view > 0) { + const unsigned char *orig_image_ptr = image_ptr; + image_ptr += orig_view_size * view; + nassertd(image_ptr >= orig_image_ptr && image_ptr + orig_view_size <= orig_image_ptr + tex->get_ram_mipmap_image_size(n)) { + success = false; + break; } - break; -#endif // OPENGLES_1 + } + orig_page_size = tex->get_ram_mipmap_page_size(n); + page_size = orig_page_size; + if (!compressed) { + // May need to convert. 
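+        // The data may be repacked to match the external format's component
+        // count during upload, so recompute the per-page byte size accordingly.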
+ page_size = get_external_format_components(external_format) * (page_size / num_components); + } + } #ifndef OPENGLES_1 - case GL_TEXTURE_BUFFER: - if (_supports_buffer_texture) { - _glBufferSubData(GL_TEXTURE_BUFFER, 0, page_size, image_ptr); - } else { - report_my_gl_errors(); - return false; + else if (target == GL_TEXTURE_BUFFER) { + // page_size for buffer texture indicates the size even for a null image. + page_size = tex->get_expected_ram_mipmap_view_size(n); + } +#endif + + // Don't need to update the padded area at the bottom. + int sub_height = height; + if (n == 0 && (target == GL_TEXTURE_2D || target == GL_TEXTURE_CUBE_MAP)) { + sub_height -= tex->get_pad_y_size(); + } + +#if defined(HAVE_THREADS) && !defined(OPENGLES_1) + if (mapped_ptr != nullptr) { + // Let's make sure we have texture storage (normally this is handled by + // the glTexStorage2D calls above, if immutable texture storage is + // supported and enabled), it makes other things easier down the line. + if (needs_reload) { + PStatTimer timer(_create_texture_storage_pcollector); + if (!upload_texture_level(true, compressed, target, level, + width, height, depth, internal_format, + external_format, component_type, + nullptr, page_size, pages, usage)) { + if (level == 0) { + // If level 0 failed to create, this texture is useless. + success = false; + } + else if (_supports_texture_max_level) { + // Apparently, this is all it's going to get. + glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, level - 1); + } + break; } - break; -#endif // OPENGLES + } - case GL_TEXTURE_CUBE_MAP: - if (_supports_cube_map) { - // This is the only texture type that must be specified using separate - // per-page calls. - if (n == 0) { - height = tex->get_y_size() - tex->get_pad_y_size(); + // Spawn a task to do the upload asynchronously into the PBO. + if (image_ptr != nullptr) { + void *mapped_level_ptr = (char *)mapped_ptr + pbo_offset; + + if (ptimage) { + ptimage.node_ref(); + } + _async_chain->add([=, ptimage = std::move(ptimage), token = counter.make_token()](AsyncTask *task) mutable { + { + PStatTimer timer(_load_texture_copy_pcollector); + copy_image((unsigned char *)mapped_level_ptr, image_ptr, orig_view_size, + external_format, num_components, component_width); } - for (int z = 0; z < 6; ++z) { - if (pages.get_bit(z)) { - GLenum page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + z; - const unsigned char *page_ptr = image_ptr + page_size * z; - - if (image_compression == Texture::CM_off) { - glTexSubImage2D(page_target, n - mipmap_bias, 0, 0, width, height, - external_format, component_type, page_ptr); - } else { - _glCompressedTexSubImage2D(page_target, n - mipmap_bias, - 0, 0, width, height, - external_format, page_size, page_ptr); - } - } + if (ptimage) { + ptimage.node_unref(); } - } else { - report_my_gl_errors(); - return false; + + token.complete(true); + return AsyncTask::DS_done; + }, "copy:" + tex->get_name()); + + if (async_levels == nullptr) { + async_levels = new AsyncLevel[num_levels + 1]; } + async_levels[num_async_levels++] = {width, sub_height, depth, page_size, pbo_offset, std::move(pages)}; + pbo_offset += page_size * depth; + continue; + } + } +#endif + if (image_ptr != nullptr) { + if (page_size != orig_page_size || external_format == GL_RGBA || external_format == GL_RGB) { + // If the GL doesn't claim to support BGR, we may have to reverse + // the component ordering of the image. 
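+        // (Panda keeps its RAM images in BGR(A) order; if the GL gave us a plain
+        // GL_RGB/GL_RGBA external format, or the page size changed above, copy
+        // the data into a converted buffer before uploading.)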
+ PStatTimer timer(_load_texture_copy_pcollector); + PTA_uchar new_image = PTA_uchar::empty_array(page_size * depth); + copy_image(&new_image[0], image_ptr, orig_view_size, + external_format, num_components, component_width); + + ptimage = std::move(new_image); + } + } + + // Try updating the existing storage (sub-loading) first. + if (!needs_reload) { + if (!upload_texture_level(false, compressed, target, level, + width, sub_height, depth, internal_format, + external_format, component_type, + image_ptr, page_size, pages, usage)) { break; + } - default: - if (image_compression == Texture::CM_off) { - if (n == 0) { - // It's unfortunate that we can't adjust the width, too, but - // TexSubImage2D doesn't accept a row-stride parameter. - height = tex->get_y_size() - tex->get_pad_y_size(); - } - glTexSubImage2D(target, n - mipmap_bias, 0, 0, width, height, - external_format, component_type, image_ptr); - } else { - _glCompressedTexSubImage2D(target, n - mipmap_bias, 0, 0, width, height, - external_format, page_size, image_ptr); + // Did that fail? If it did, we'll immediately try again, this time + // loading the texture from scratch. + GLenum error_code = gl_get_error(); + if (error_code != GL_NO_ERROR) { + if (GLCAT.is_warning()) { + GLCAT.warning() + << "GL texture subload failed for " << tex->get_name() + << " level " << level << ": " << get_error_string(error_code) << "\n"; } - break; + needs_reload = true; } } - // Did that fail? If it did, we'll immediately try again, this time - // loading the texture from scratch. - GLenum error_code = gl_get_error(); - if (error_code != GL_NO_ERROR) { - if (GLCAT.is_debug()) { - GLCAT.debug() - << "GL texture subload failed for " << tex->get_name() - << " : " << get_error_string(error_code) << "\n"; + if (needs_reload) { + if (!upload_texture_level(true, compressed, target, level, + width, height, depth, internal_format, + external_format, component_type, + image_ptr, page_size, pages, usage)) { + + if (_supports_texture_max_level) { + // Apparently, this is all it's going to get. + glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, level); + } + if (level == 0) { + success = false; + } + break; } - needs_reload = true; } } + // Purely synchronous path, we can finish up the creation now. + // Report the error message explicitly if the GL texture creation failed. if (needs_reload) { - // Load the image up from scratch, creating a new GL Texture object. - if (GLCAT.is_debug()) { - GLCAT.debug() - << "loading new texture object for " << tex->get_name() << " view " - << view << ", " << width << " x " << height << " x " << depth - << ", mipmaps " << num_ram_mipmap_levels << " / " << num_levels << "\n"; + GLenum error_code = gl_get_error(); + if (error_code != GL_NO_ERROR) { + GLCAT.error() + << "GL texture creation failed for " << tex->get_name() + << " : " << get_error_string(error_code) << "\n"; + + gtc->_has_storage = false; + success = false; } + } - // If there is immutable storage, this is impossible to do, and we should - // not have gotten here at all. - nassertr(!gtc->_immutable, false); +#if defined(HAVE_THREADS) && !defined(OPENGLES_1) + if (async_levels != nullptr) { + // Schedule a follow-up task to finish the upload, which needs to happen + // with bound context, so we use a special mini job queue for that. - if (num_ram_mipmap_levels == 0) { - if (GLCAT.is_debug()) { - GLCAT.debug() - << " (initializing NULL image)\n"; - } + // Storing 0 as last item saves some closure space. 
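+    // A zero width acts as the sentinel that ends the level loop in the
+    // completion callback below.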
+ async_levels[num_async_levels] = {0}; - if ((external_format == GL_DEPTH_STENCIL) && get_supports_depth_stencil()) { -#ifdef OPENGLES - component_type = GL_UNSIGNED_INT_24_8_OES; -#else - component_type = GL_UNSIGNED_INT_24_8_EXT; -#endif - } - } - - for (int n = mipmap_bias; n < num_levels + mipmap_bias; ++n) { - const unsigned char *image_ptr = (unsigned char*)tex->get_ram_mipmap_pointer(n); - CPTA_uchar ptimage; - if (image_ptr == nullptr) { - ptimage = tex->get_ram_mipmap_image(n); - if (ptimage.is_null()) { - if (n - mipmap_bias < num_ram_mipmap_levels) { - // We were told we'd have this many RAM mipmap images, but we - // don't. Raise a warning. - GLCAT.warning() - << "No mipmap level " << n << " defined for " << tex->get_name() - << "\n"; - if (_supports_texture_max_level) { - // Tell the GL we have no more mipmaps for it to use. - glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, n - mipmap_bias); + std::move(counter).then([ + this, token = std::move(token), + async_levels, gtc, view, pbo, pbo_size, upload_count, + external_format, component_type, compressed + ] (bool success) mutable { + call_later([=, token = std::move(token)] () mutable { + _glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo); + _glUnmapBuffer(GL_PIXEL_UNPACK_BUFFER); + + if (gtc->_uploads_finished - upload_count >= 0) { + // Updates arrived out of order, so we skip this one, since a newer + // update was already finished. + GLCAT.info() + << "Discarding async update #" << upload_count << " to texture " + << gtc->get_texture()->get_name() << "\n"; + success = false; + } + else if (view >= gtc->_num_views) { + // If we uploaded a view that is no longer needed, we silently + // consider it a success, even if the task failed. + success = true; + } + else if (!apply_texture(gtc, view)) { + success = false; + } + else if (success) { + PStatTimer timer(_load_texture_pcollector); + + if (gtc->_target == GL_TEXTURE_BUFFER) { + // We can use a trick for buffer textures: just swap the "PBO" with + // the existing texture storage. The existing storage becomes the + // new PBO. Note also that buffer textures have no mipmaps. + _glTexBuffer(GL_TEXTURE_BUFFER, gtc->_internal_format, pbo); + std::swap(pbo, gtc->_buffers[view]); + } + else { + for (int level = 0; async_levels[level].width != 0; ++level) { + AsyncLevel &data = async_levels[level]; + if (!upload_texture_level(false, compressed, gtc->_target, level, + data.width, data.height, data.depth, + gtc->_internal_format, external_format, + component_type, (unsigned char *)data.pbo_offset, + data.page_size, data.pages, GL_STATIC_DRAW)) { + if (_supports_texture_max_level && !gtc->_generate_mipmaps) { + // Apparently, this is all it's going to get. + glTexParameteri(gtc->_target, GL_TEXTURE_MAX_LEVEL, level - 1); + } + success = false; + break; + } } - break; } - if (tex->has_clear_color()) { - // Ask the Texture class to create the mipmap level in RAM. It'll - // fill it in with the correct clear color, which we can then - // upload. - ptimage = tex->make_ram_mipmap_image(n); + if (success) { + gtc->_uploads_finished = upload_count; } - else if (image_compression != Texture::CM_off) { - // We can't upload a NULL compressed texture. - if (_supports_texture_max_level) { - // Tell the GL we have no more mipmaps for it to use. - glTexParameteri(target, GL_TEXTURE_MAX_LEVEL, n - mipmap_bias); + + if (gtc->_generate_mipmaps && _glGenerateMipmap != nullptr) { + // We uploaded an image; we may need to generate mipmaps. 
+ if (GLCAT.is_debug()) { + GLCAT.debug() + << "generating mipmaps for texture " << gtc->get_texture()->get_name() + << " view " << view << ", " << async_levels[0].width << " x " + << async_levels[0].height << " x " << async_levels[0].depth + << ", mipmaps = " << gtc->_num_levels + << " (async update #" << upload_count << ")\n"; + } + _glGenerateMipmap(gtc->_target); + } + + if (success && gtc->get_texture()->get_post_load_store_cache()) { + if (!do_extract_texture_data(gtc, view)) { + success = false; } - break; } } - image_ptr = ptimage; + _glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + // This goes back into the pool. + gtc->return_pbo(pbo, pbo_size); + + token.complete(success); + + delete[] async_levels; + }); + }); + } else +#endif + { + // This ensures any pending async op will not overwrite what we just did. + gtc->_uploads_finished = upload_count; + + if (pbo != 0) { + // For whatever reason, we haven't used the PBO, so return it. + gtc->return_pbo(pbo, pbo_size); + } + + if (gtc->_generate_mipmaps && _glGenerateMipmap != nullptr && !image.is_null()) { + // We uploaded an image; we may need to generate mipmaps. + if (GLCAT.is_debug()) { + GLCAT.debug() + << "generating mipmaps for texture " << gtc->get_texture()->get_name() << " view " + << view << ", " << width << " x " << height << " x " << depth + << ", mipmaps = " << num_levels << "\n"; } + _glGenerateMipmap(gtc->_target); + } - PTA_uchar bgr_image; - size_t view_size = tex->get_ram_mipmap_view_size(n); - if (image_ptr != nullptr) { - const unsigned char *orig_image_ptr = image_ptr; - image_ptr += view_size * view; - nassertr(image_ptr >= orig_image_ptr && image_ptr + view_size <= orig_image_ptr + tex->get_ram_mipmap_image_size(n), false); - - if (image_compression == Texture::CM_off) { - // If the GL doesn't claim to support BGR, we may have to reverse - // the component ordering of the image. - image_ptr = fix_component_ordering(bgr_image, image_ptr, view_size, - external_format, tex); - } + if (gtc->get_texture()->get_post_load_store_cache()) { + if (!do_extract_texture_data(gtc, view)) { + success = false; } + } - int width = tex->get_expected_mipmap_x_size(n); - int height = tex->get_expected_mipmap_y_size(n); -#ifndef OPENGLES_1 - int depth = tex->get_expected_mipmap_z_size(n); -#endif + token.complete(success); + } -#ifdef DO_PSTATS - _data_transferred_pcollector.add_level(view_size); -#endif - switch (target) { -#ifndef OPENGLES // 1-d textures not supported by OpenGL ES. Fall through. - case GL_TEXTURE_1D: - if (image_compression == Texture::CM_off) { - glTexImage1D(target, n - mipmap_bias, internal_format, - width, 0, external_format, component_type, image_ptr); - } else { - _glCompressedTexImage1D(target, n - mipmap_bias, external_format, - width, 0, view_size, image_ptr); - } - break; -#endif // OPENGLES // OpenGL ES will fall through. + report_my_gl_errors(); -#ifndef OPENGLES_1 - case GL_TEXTURE_3D: - if (_supports_3d_texture) { - if (image_compression == Texture::CM_off) { - _glTexImage3D(target, n - mipmap_bias, internal_format, - width, height, depth, 0, - external_format, component_type, image_ptr); - } else { - _glCompressedTexImage3D(target, n - mipmap_bias, external_format, - width, height, depth, 0, view_size, image_ptr); - } - } else { - report_my_gl_errors(); - return false; - } - break; -#endif // OPENGLES_1 + return success; +} +/** + * Performs the actual OpenGL call to update the texture data for the given + * mipmap level (be sure to subtract the mipmap_bias before passing it in). 
+ * + * If full_reload is true, recreates the texture storage, otherwise subloads + * into the existing texture storage. A texture storage with undefined + * contents can be created by setting image_ptr to nullptr, in which case + * compressed must be false. + * + * Returns true if this texture format was supported, false otherwise. + */ +bool CLP(GraphicsStateGuardian):: +upload_texture_level(bool full_reload, bool compressed, GLenum target, + int level, int width, int height, int depth, + GLint internal_format, GLint external_format, + GLenum component_type, const unsigned char *image_ptr, + size_t page_size, SparseArray pages, + GLenum usage_hint) { + + switch (target) { #ifndef OPENGLES_1 - case GL_TEXTURE_2D_ARRAY: - case GL_TEXTURE_CUBE_MAP_ARRAY: - if (_supports_2d_texture_array) { - if (image_compression == Texture::CM_off) { - _glTexImage3D(target, n - mipmap_bias, internal_format, - width, height, depth, 0, - external_format, component_type, image_ptr); - } else { - _glCompressedTexImage3D(target, n - mipmap_bias, external_format, - width, height, depth, 0, view_size, image_ptr); - } - } else { - report_my_gl_errors(); - return false; - } - break; + case GL_TEXTURE_3D: + if (!_supports_3d_texture) { + return false; + } - case GL_TEXTURE_BUFFER: - if (_supports_buffer_texture) { - _glBufferData(GL_TEXTURE_BUFFER, view_size, image_ptr, - get_usage(tex->get_usage_hint())); + if (full_reload) { + if (!compressed) { + _glTexImage3D(target, level, internal_format, + width, height, depth, 0, + external_format, component_type, image_ptr); + } else { + _glCompressedTexImage3D(target, level, external_format, + width, height, depth, 0, page_size * depth, image_ptr); + } + } else { + for (size_t sri = 0; sri < pages.get_num_subranges(); ++sri) { + int begin = pages.get_subrange_begin(sri); + int num_pages = pages.get_subrange_end(sri) - begin; + const unsigned char *page_ptr = image_ptr + page_size * begin; + + if (!compressed) { + _glTexSubImage3D(target, level, + 0, 0, begin, width, height, num_pages, + external_format, component_type, page_ptr); } else { - report_my_gl_errors(); - return false; + _glCompressedTexSubImage3D(target, level, + 0, 0, begin, width, height, num_pages, + external_format, + page_size * num_pages, page_ptr); } - break; + } + } + break; #endif // OPENGLES_1 - case GL_TEXTURE_CUBE_MAP: - if (_supports_cube_map) { - // This is the only texture type that must be specified using separate - // per-page calls. - size_t page_size = tex->get_ram_mipmap_page_size(n); - for (int z = 0; z < 6; ++z) { - GLenum page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + z; - const unsigned char *page_ptr = - (image_ptr != nullptr) ? 
image_ptr + page_size * z : nullptr; - - if (image_compression == Texture::CM_off) { - glTexImage2D(page_target, n - mipmap_bias, internal_format, - width, height, 0, +#ifndef OPENGLES_1 + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_CUBE_MAP_ARRAY: + if (!_supports_2d_texture_array) { + return false; + } + + if (full_reload) { + if (!compressed) { + _glTexImage3D(target, level, internal_format, width, height, depth, 0, + external_format, component_type, image_ptr); + } else { + _glCompressedTexImage3D(target, level, external_format, + width, height, depth, 0, + page_size * depth, image_ptr); + } + } else { + for (size_t sri = 0; sri < pages.get_num_subranges(); ++sri) { + int begin = pages.get_subrange_begin(sri); + int num_pages = pages.get_subrange_end(sri) - begin; + const unsigned char *page_ptr = image_ptr + page_size * begin; + + if (!compressed) { + _glTexSubImage3D(target, level, + 0, 0, begin, width, height, num_pages, external_format, component_type, page_ptr); - } else { - _glCompressedTexImage2D(page_target, n - mipmap_bias, external_format, - width, height, 0, page_size, page_ptr); - } - } } else { - report_my_gl_errors(); - return false; + _glCompressedTexSubImage3D(target, level, + 0, 0, begin, width, height, num_pages, + external_format, + page_size * num_pages, page_ptr); } - break; + } + } + break; +#endif // !OPENGLES_1 - default: - if (image_compression == Texture::CM_off) { - glTexImage2D(target, n - mipmap_bias, internal_format, - width, height, 0, - external_format, component_type, image_ptr); +#ifndef OPENGLES_1 + case GL_TEXTURE_BUFFER: + if (!_supports_buffer_texture) { + return false; + } + + if (full_reload) { + _glBufferData(GL_TEXTURE_BUFFER, page_size, image_ptr, usage_hint); + } else { + _glBufferSubData(GL_TEXTURE_BUFFER, 0, page_size, image_ptr); + } + break; +#endif // !OPENGLES_1 + + case GL_TEXTURE_CUBE_MAP: + if (!_supports_cube_map) { + return false; + } + + // This is the only texture type that must be specified using separate + // per-page calls. + if (full_reload) { + for (int z = 0; z < 6; ++z) { + GLenum page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + z; + const unsigned char *page_ptr = + (image_ptr != nullptr) ? image_ptr + page_size * z : nullptr; + + if (!compressed) { + glTexImage2D(page_target, level, internal_format, width, height, 0, + external_format, component_type, page_ptr); } else { - _glCompressedTexImage2D(target, n - mipmap_bias, external_format, - width, height, 0, view_size, image_ptr); + _glCompressedTexImage2D(page_target, level, external_format, + width, height, 0, page_size, page_ptr); + } + } + } else { + for (int z = 0; z < 6; ++z) { + if (pages.get_bit(z)) { + GLenum page_target = GL_TEXTURE_CUBE_MAP_POSITIVE_X + z; + const unsigned char *page_ptr = image_ptr + page_size * z; + + if (!compressed) { + glTexSubImage2D(page_target, level, 0, 0, width, height, + external_format, component_type, page_ptr); + } else { + _glCompressedTexSubImage2D(page_target, level, + 0, 0, width, height, + external_format, page_size, page_ptr); + } } } } + break; - // Report the error message explicitly if the GL texture creation failed. 
- GLenum error_code = gl_get_error(); - if (error_code != GL_NO_ERROR) { - GLCAT.error() - << "GL texture creation failed for " << tex->get_name() - << " : " << get_error_string(error_code) << "\n"; +#ifndef OPENGLES + case GL_TEXTURE_1D: + if (full_reload) { + if (!compressed) { + glTexImage1D(target, level, internal_format, + width, 0, external_format, component_type, image_ptr); + } else { + _glCompressedTexImage1D(target, level, external_format, + width, 0, page_size, image_ptr); + } + } else { + if (!compressed) { + glTexSubImage1D(target, level, 0, width, + external_format, component_type, image_ptr); + } else { + _glCompressedTexSubImage1D(target, level, 0, width, + external_format, page_size, image_ptr); + } + } + break; +#endif // !OPENGLES - gtc->_has_storage = false; - return false; + default: + if (full_reload) { + if (!compressed) { + glTexImage2D(target, level, internal_format, width, height, 0, + external_format, component_type, image_ptr); + } else { + _glCompressedTexImage2D(target, level, external_format, + width, height, 0, page_size, image_ptr); + } + } else { + if (!compressed) { + glTexSubImage2D(target, level, 0, 0, width, height, + external_format, component_type, image_ptr); + } else { + _glCompressedTexSubImage2D(target, level, 0, 0, width, height, + external_format, page_size, image_ptr); + } } + break; + } + +#ifdef DO_PSTATS + if (full_reload) { + _data_transferred_pcollector.add_level(page_size * depth); + } else { + _data_transferred_pcollector.add_level(pages.get_num_on_bits() * depth); } +#endif - if (gtc->_generate_mipmaps && _glGenerateMipmap != nullptr && !image.is_null()) { - // We uploaded an image; we may need to generate mipmaps. + // Did that fail? If it did, we'll immediately try again, this time + // loading the texture from scratch. + /*GLenum error_code = gl_get_error(); + if (error_code != GL_NO_ERROR) { if (GLCAT.is_debug()) { GLCAT.debug() - << "generating mipmaps for texture " << tex->get_name() << " view " - << view << ", " << width << " x " << height << " x " << depth - << ", mipmaps = " << num_levels << "\n"; + << "GL texture subload failed for " << tex->get_name() + << " : " << get_error_string(error_code) << "\n"; } - _glGenerateMipmap(target); - } - - report_my_gl_errors(); - + full_reload = true; + }*/ return true; } @@ -15171,7 +15591,7 @@ do_extract_texture_data(CLP(TextureContext) *gtc, int view) { #ifndef OPENGLES_1 // Make sure any incoherent writes to the texture have been synced. - if (gtc->needs_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT)) { + if (gtc->needs_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT, false)) { issue_memory_barrier(GL_TEXTURE_UPDATE_BARRIER_BIT); } #endif @@ -16174,3 +16594,160 @@ do_issue_scissor() { } } } + +#ifndef OPENGLES_1 +/** + * Maps a buffer for reading. May be temporarily bound to the given target. 
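+ * Returns a pointer to the mapped memory, or nullptr if the mapping failed.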
+ */ +void *CLP(GraphicsStateGuardian):: +map_read_buffer(GLenum target, GLuint buffer, size_t size) { + nassertr(buffer != 0, nullptr); + +#ifndef OPENGLES + if (_glMapNamedBufferRange != nullptr) { + return _glMapNamedBufferRange(buffer, 0, size, GL_MAP_READ_BIT); + } +#endif + + void *mapped_ptr = nullptr; + + _glBindBuffer(target, buffer); +#ifdef OPENGLES + // There is neither glMapBuffer nor persistent mapping in OpenGL ES + mapped_ptr = _glMapBufferRange(target, 0, size, GL_MAP_READ_BIT); +#else + // If we get here in desktop GL, we must not have persistent mapping + mapped_ptr = _glMapBuffer(target, GL_READ_ONLY); +#endif + + _glBindBuffer(target, 0); + return mapped_ptr; +} + +/** + * Maps a buffer as write-only, discarding the previous contents. If + * create_storage is true, allocates new storage for the buffer. May use the + * given target to temporarily bind the buffer, if DSA is not supported. + */ +void *CLP(GraphicsStateGuardian):: +map_write_discard_buffer(GLenum target, GLuint buffer, size_t size, + bool create_storage) { + nassertr(buffer != 0, nullptr); + +#ifndef OPENGLES + if (!create_storage && _glMapNamedBufferRange != nullptr) { + return _glMapNamedBufferRange(buffer, 0, size, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT); + } +#endif + + _glBindBuffer(target, buffer); + + void *mapped_ptr; + if (_glMapBufferRange != nullptr) { + if (create_storage) { + if (_supports_buffer_storage) { + _glBufferStorage(target, size, nullptr, GL_MAP_WRITE_BIT); + } else { + _glBufferData(target, size, nullptr, GL_STATIC_DRAW); + } + } + mapped_ptr = _glMapBufferRange(target, 0, size, GL_MAP_WRITE_BIT | GL_MAP_INVALIDATE_BUFFER_BIT); + } else { +#ifdef OPENGLES + mapped_ptr = nullptr; +#else + // Explicitly orphan the buffer before mapping. + _glBufferData(target, size, nullptr, GL_STATIC_DRAW); + mapped_ptr = _glMapBuffer(target, GL_WRITE_ONLY); +#endif + } + + _glBindBuffer(target, 0); + return mapped_ptr; +} +#endif // !OPENGLES_1 + +#ifndef OPENGLES_1 +/** + * Inserts a fence into the command stream. + */ +void CLP(GraphicsStateGuardian):: +insert_fence(CompletionToken &&callback) { + GLsync fence = _glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, 0); + _fences.push_back({fence, std::move(callback)}); +} + +/** + * Checks which fences are finished and processes those. + */ +void CLP(GraphicsStateGuardian):: +process_fences(bool force) { + if (_fences.empty()) { + return; + } + + PStatTimer timer(_copy_texture_finish_pcollector); + + if (force) { + // Just wait for the last fence, the rest must be complete too then. + PStatTimer timer(_wait_fence_pcollector); + GLsync fence = _fences.back()._object; + _glClientWaitSync(fence, 0, (GLuint64)-1); + } + + while (!_fences.empty()) { + Fence &fence = _fences.front(); + if (!force) { + GLenum result = _glClientWaitSync(fence._object, 0, 0); + if (result != GL_ALREADY_SIGNALED && result != GL_CONDITION_SATISFIED) { + // Not yet done. The rest must not yet be done then, either. + break; + } + } + _glDeleteSync(fence._object); + + std::move(fence._token).complete(true); + _fences.pop_front(); + + // If there is 1 remaining, save it for next frame. This helps prevent an + // inconsistent frame rate when the number of fetched frames alternates + // between 0 and 2, which can settle into a stable feedback loop. + if (!force && _fences.size() == 1) { + break; + } + } +} +#endif // !OPENGLES_1 + +/** + * Adds a job to the queue to be processed later while the context is bound, + * useful for calling from other threads. 
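+ * The job will be picked up by the next call to process_pending_jobs() on the
+ * draw thread.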
+ */ +void CLP(GraphicsStateGuardian):: +call_later(Completable &&job) { + MutexHolder holder(_job_queue_mutex); + _job_queue.push_back(std::move(job)); + _job_queue_cvar.notify(); +} + +/** + * Processes any pending jobs from the queue. If wait is true, waits for at + * least one job if the queue is empty. + * + * May only be called on the draw thread. + */ +void CLP(GraphicsStateGuardian):: +process_pending_jobs(bool wait) { + JobQueue jobs; + { + MutexHolder holder(_job_queue_mutex); + if (wait && _job_queue.empty()) { + _job_queue_cvar.wait(); + } + _job_queue.swap(jobs); + } + + for (auto &job : jobs) { + std::move(job)(); + } +} diff --git a/panda/src/glstuff/glGraphicsStateGuardian_src.h b/panda/src/glstuff/glGraphicsStateGuardian_src.h index 03f7a0097d7..a355962005e 100644 --- a/panda/src/glstuff/glGraphicsStateGuardian_src.h +++ b/panda/src/glstuff/glGraphicsStateGuardian_src.h @@ -39,6 +39,8 @@ #include "geomVertexArrayData.h" #include "lightMutex.h" #include "pStatGPUTimer.h" +#include "completionToken.h" +#include "asyncTaskChain.h" class PlaneNode; class Light; @@ -230,6 +232,7 @@ typedef void (APIENTRYP PFNGLGETPROGRAMBINARYPROC) (GLuint program, GLsizei bufS typedef void (APIENTRYP PFNGLPROGRAMBINARYPROC) (GLuint program, GLenum binaryFormat, const void *binary, GLsizei length); typedef void (APIENTRYP PFNGLGETINTERNALFORMATIVPROC) (GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params); typedef void (APIENTRYP PFNGLBUFFERSTORAGEPROC) (GLenum target, GLsizeiptr size, const void *data, GLbitfield flags); +typedef void (APIENTRYP PFNGLCOPYBUFFERSUBDATAPROC) (GLenum readTarget, GLenum writeTarget, GLintptr readOffset, GLintptr writeOffset, GLsizeiptr size); typedef void (APIENTRYP PFNGLBINDIMAGETEXTUREPROC) (GLuint unit, GLuint texture, GLint level, GLboolean layered, GLint layer, GLenum access, GLenum format); typedef void (APIENTRYP PFNGLCLEARTEXIMAGEPROC) (GLuint texture, GLint level, GLenum format, GLenum type, const void *data); typedef void (APIENTRYP PFNGLCLEARTEXSUBIMAGEPROC) (GLuint texture, GLint level, GLint xoffset, GLint yoffset, GLint zoffset, GLsizei width, GLsizei height, GLsizei depth, GLenum format, GLenum type, const void *data); @@ -345,7 +348,8 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { #endif virtual TextureContext *prepare_texture(Texture *tex); - virtual bool update_texture(TextureContext *tc, bool force); + virtual bool update_texture(TextureContext *tc, bool force, + CompletionToken token = CompletionToken()); virtual void release_texture(TextureContext *tc); virtual void release_textures(const pvector &contexts); virtual bool extract_texture_data(Texture *tex); @@ -419,7 +423,6 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { virtual bool framebuffer_copy_to_ram (Texture *tex, int view, int z, const DisplayRegion *dr, const RenderBuffer &rb, ScreenshotRequest *request); - void finish_async_framebuffer_ram_copies(bool force = false); #ifdef SUPPORT_FIXED_FUNCTION void apply_fog(Fog *fog); @@ -637,12 +640,21 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { bool apply_texture(CLP(TextureContext) *gtc, int view); bool apply_sampler(GLuint unit, const SamplerState &sampler, CLP(TextureContext) *gtc, int view); - bool upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps); - bool upload_texture_image(CLP(TextureContext) *gtc, int view, - bool needs_reload, int mipmap_bias, int num_levels, + bool 
upload_texture(CLP(TextureContext) *gtc, bool force, bool uses_mipmaps, + CompletionToken token = CompletionToken()); + bool upload_texture_view(CLP(TextureContext) *gtc, int view, + bool needs_reload, int mipmap_bias, int num_levels, + GLint internal_format, GLint external_format, + GLenum component_type, bool compressed, + int async_buffers, CompletionToken token); + bool upload_texture_level(bool full_reload, bool compressed, + GLenum target, int level, + int width, int height, int depth, GLint internal_format, GLint external_format, GLenum component_type, - Texture::CompressionMode image_compression); + const unsigned char *image_ptr, + size_t page_size, SparseArray pages, + GLenum usage_hint); void generate_mipmaps(CLP(TextureContext) *gtc); bool upload_simple_texture(CLP(TextureContext) *gtc); @@ -658,6 +670,20 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { void do_point_size(); #endif +#ifndef OPENGLES_1 + void *map_read_buffer(GLenum target, GLuint buffer, size_t size); + void *map_write_discard_buffer(GLenum target, GLuint buffer, size_t size, + bool create_storage); +#endif + +#ifndef OPENGLES_1 + void insert_fence(CompletionToken &&callback); + void process_fences(bool force); +#endif + + void call_later(Completable &&job); + void process_pending_jobs(bool wait); + enum AutoAntialiasMode { AA_poly, AA_line, @@ -904,6 +930,10 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { PFNGLGETBUFFERSUBDATAPROC _glGetBufferSubData; #endif +#ifndef OPENGLES_1 + PFNGLCOPYBUFFERSUBDATAPROC _glCopyBufferSubData; +#endif + #ifdef OPENGLES PFNGLMAPBUFFERRANGEEXTPROC _glMapBufferRange; PFNGLUNMAPBUFFEROESPROC _glUnmapBuffer; @@ -911,6 +941,10 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { PFNGLMAPBUFFERRANGEPROC _glMapBufferRange; #endif +#ifndef OPENGLES_1 + bool _supports_pixel_buffers; +#endif + #ifndef OPENGLES_1 bool _supports_uniform_buffers; bool _supports_shader_buffers; @@ -978,6 +1012,7 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { PFNGLTEXTUREPARAMETERIPROC _glTextureParameteri; PFNGLGENERATETEXTUREMIPMAPPROC _glGenerateTextureMipmap; PFNGLBINDTEXTUREUNITPROC _glBindTextureUnit; + PFNGLMAPNAMEDBUFFERRANGEPROC _glMapNamedBufferRange; #endif #ifndef OPENGLES_1 @@ -1162,12 +1197,14 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { #endif #ifndef OPENGLES_1 - // Stores textures for which memory bariers should be issued. - typedef pset TextureSet; - TextureSet _textures_needing_fetch_barrier; - TextureSet _textures_needing_image_access_barrier; - TextureSet _textures_needing_update_barrier; - TextureSet _textures_needing_framebuffer_barrier; + // This count increments every time the corresponding barrier is issued. + // GLTextureContext et al store copies of this counter, when a write is + // performed on a texture, it will set its counter to match the value on the + // GSG to indicate that it is out of sync and the barrier needs to be issued. 
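+  // As long as a context's stored value still equals the GSG counter, the write
+  // is unsynced and needs_barrier() returns true; issuing the barrier increments
+  // the GSG counter, which implicitly marks all outstanding writes as synced.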
+ int _texture_fetch_barrier_counter = 0; + int _shader_image_access_barrier_counter = 0; + int _texture_update_barrier_counter = 0; + int _framebuffer_barrier_counter = 0; int _shader_storage_barrier_counter = 0; #endif @@ -1218,16 +1255,21 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { FrameTiming *_current_frame_timing = nullptr; #endif - struct AsyncRamCopy { - PT(ScreenshotRequest) _request; - GLuint _pbo; - GLsync _fence; - GLuint _external_format; - int _view; - void *_mapped_pointer; - size_t _size; + struct Fence { + GLsync _object; + CompletionToken _token; }; - pdeque _async_ram_copies; + pdeque _fences; + +#ifdef HAVE_THREADS + AsyncTaskChain *_async_chain; +#endif + + // Min job system pending a real job system + typedef pvector JobQueue; + Mutex _job_queue_mutex; + ConditionVar _job_queue_cvar; + JobQueue _job_queue; BufferResidencyTracker _renderbuffer_residency; @@ -1272,6 +1314,7 @@ class EXPCL_GL CLP(GraphicsStateGuardian) : public GraphicsStateGuardian { friend class CLP(BufferContext); friend class CLP(ShaderContext); friend class CLP(CgShaderContext); + friend class CLP(TextureContext); friend class CLP(GraphicsBuffer); friend class CLP(OcclusionQueryContext); }; diff --git a/panda/src/glstuff/glShaderContext_src.cxx b/panda/src/glstuff/glShaderContext_src.cxx index dde88cc5546..f928882d2f4 100644 --- a/panda/src/glstuff/glShaderContext_src.cxx +++ b/panda/src/glstuff/glShaderContext_src.cxx @@ -2813,12 +2813,6 @@ update_shader_texture_bindings(ShaderContext *prev) { int view = _glgsg->get_current_tex_view_offset(); gl_tex = gtc->get_view_index(view); - -#ifndef OPENGLES - if (gtc->needs_barrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT)) { - barriers |= GL_SHADER_IMAGE_ACCESS_BARRIER_BIT; - } -#endif } } input._writable = false; @@ -2879,7 +2873,17 @@ update_shader_texture_bindings(ShaderContext *prev) { access = GL_READ_ONLY; gl_tex = 0; } + } else { + // If no parameters were specified, we have to assume writable access. + input._writable = true; } + +#ifndef OPENGLES + if (gtc->needs_barrier(GL_SHADER_IMAGE_ACCESS_BARRIER_BIT, input._writable)) { + barriers |= GL_SHADER_IMAGE_ACCESS_BARRIER_BIT; + } +#endif + _glgsg->_glBindImageTexture(i, gl_tex, bind_level, layered, bind_layer, access, gtc->_internal_format); } @@ -2969,7 +2973,7 @@ update_shader_texture_bindings(ShaderContext *prev) { #ifndef OPENGLES // If it was recently written to, we will have to issue a memory barrier // soon. - if (gtc->needs_barrier(GL_TEXTURE_FETCH_BARRIER_BIT)) { + if (gtc->needs_barrier(GL_TEXTURE_FETCH_BARRIER_BIT, false)) { barriers |= GL_TEXTURE_FETCH_BARRIER_BIT; } #endif diff --git a/panda/src/glstuff/glTextureContext_src.I b/panda/src/glstuff/glTextureContext_src.I index bde6a4d977a..a0a986e80f9 100644 --- a/panda/src/glstuff/glTextureContext_src.I +++ b/panda/src/glstuff/glTextureContext_src.I @@ -59,3 +59,43 @@ get_view_buffer(int view) const { return 0; } } + +/** + * Returns true if an async upload is pending. + */ +INLINE bool CLP(TextureContext):: +is_upload_pending() const { + // We can't simply compare _uploads_started to _uploads_finished, since + // they also get set to the same by cancel_pending_uploads() + return _uploads_pending > 0; +} + +/** + * Waits for all uploads to be finished. + */ +INLINE void CLP(TextureContext):: +wait_pending_uploads() const { + if (is_upload_pending()) { + do_wait_pending_uploads(); + } +} + +/** + * Cancels all asynchronous uploads. 
Not guaranteed to be cancelled by the + * time this returns, consider following this up with a call to + * wait_pending_uploads(). + */ +INLINE void CLP(TextureContext):: +cancel_pending_uploads() { + _uploads_finished = _uploads_started; +} + +/** + * Waits for an unused PBO unless we're not at the given limit of PBOs yet. + */ +INLINE void CLP(TextureContext):: +wait_for_unused_pbo(int limit) const { + if (_unused_pbos.empty() && _num_pbos >= limit) { + do_wait_for_unused_pbo(limit); + } +} diff --git a/panda/src/glstuff/glTextureContext_src.cxx b/panda/src/glstuff/glTextureContext_src.cxx index 7572ef8a4f4..efc001df347 100644 --- a/panda/src/glstuff/glTextureContext_src.cxx +++ b/panda/src/glstuff/glTextureContext_src.cxx @@ -13,6 +13,8 @@ #include "pnotify.h" +static PStatCollector _wait_async_texture_uploads_pcollector("Wait:Async Texture Uploads"); + TypeHandle CLP(TextureContext)::_type_handle; /** @@ -48,6 +50,8 @@ evict_lru() { */ void CLP(TextureContext):: reset_data(GLenum target, int num_views) { + cancel_pending_uploads(); + // Free the texture resources. set_num_views(0); @@ -63,12 +67,13 @@ reset_data(GLenum target, int num_views) { #ifndef OPENGLES_1 // Mark the texture as coherent. - if (gl_enable_memory_barriers) { - _glgsg->_textures_needing_fetch_barrier.erase(this); - _glgsg->_textures_needing_image_access_barrier.erase(this); - _glgsg->_textures_needing_update_barrier.erase(this); - _glgsg->_textures_needing_framebuffer_barrier.erase(this); - } + _texture_fetch_barrier_counter = _glgsg->_texture_fetch_barrier_counter - 1; + _shader_image_read_barrier_counter = _glgsg->_shader_image_access_barrier_counter - 1; + _shader_image_write_barrier_counter = _glgsg->_shader_image_access_barrier_counter - 1; + _texture_read_barrier_counter = _glgsg->_texture_update_barrier_counter - 1; + _texture_write_barrier_counter = _glgsg->_shader_image_access_barrier_counter - 1; + _framebuffer_read_barrier_counter = _glgsg->_framebuffer_barrier_counter - 1; + _framebuffer_write_barrier_counter = _glgsg->_framebuffer_barrier_counter - 1; #endif } @@ -168,26 +173,50 @@ set_num_views(int num_views) { #ifndef OPENGLES_1 /** - * + * Returns true if the texture needs a barrier before a read or write of the + * given kind. If writing is false, only writes are synced, otherwise both + * reads and writes are synced. */ bool CLP(TextureContext):: -needs_barrier(GLbitfield barrier) { +needs_barrier(GLbitfield barrier, bool writing) { if (!gl_enable_memory_barriers) { return false; } - return (((barrier & GL_TEXTURE_FETCH_BARRIER_BIT) && - _glgsg->_textures_needing_fetch_barrier.count(this))) - || (((barrier & GL_SHADER_IMAGE_ACCESS_BARRIER_BIT) && - _glgsg->_textures_needing_image_access_barrier.count(this))) - || (((barrier & GL_TEXTURE_UPDATE_BARRIER_BIT) && - _glgsg->_textures_needing_update_barrier.count(this))) - || (((barrier & GL_FRAMEBUFFER_BARRIER_BIT) && - _glgsg->_textures_needing_framebuffer_barrier.count(this))); + if (barrier & GL_TEXTURE_FETCH_BARRIER_BIT) { + // This is always a read, so only sync RAW. + if (_glgsg->_texture_fetch_barrier_counter == _texture_fetch_barrier_counter) { + return true; + } + } + + if (barrier & GL_SHADER_IMAGE_ACCESS_BARRIER_BIT) { + // Sync WAR, WAW and RAW, but not RAR. 
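+    // (A new write must be ordered after earlier reads and writes, and a new
+    // read after earlier writes, but two consecutive reads never need a barrier.)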
+ if ((writing && _glgsg->_shader_image_access_barrier_counter == _shader_image_read_barrier_counter) || + (_glgsg->_shader_image_access_barrier_counter == _shader_image_write_barrier_counter)) { + return true; + } + } + + if (barrier & GL_TEXTURE_UPDATE_BARRIER_BIT) { + if ((writing && _glgsg->_texture_update_barrier_counter == _texture_read_barrier_counter) || + (_glgsg->_texture_update_barrier_counter == _texture_write_barrier_counter)) { + return true; + } + } + + if (barrier & GL_FRAMEBUFFER_BARRIER_BIT) { + if ((writing && _glgsg->_framebuffer_barrier_counter == _framebuffer_read_barrier_counter) || + (_glgsg->_framebuffer_barrier_counter == _framebuffer_write_barrier_counter)) { + return true; + } + } + + return false; } /** - * Mark a texture as needing a memory barrier, since a non-coherent read or + * Mark a texture as needing a memory barrier, since an unsynchronized read or * write just happened to it. If 'wrote' is true, it was written to. */ void CLP(TextureContext):: @@ -199,16 +228,73 @@ mark_incoherent(bool wrote) { // If we only read from it, the next read operation won't need another // barrier, since it'll be reading the same data. if (wrote) { - _glgsg->_textures_needing_fetch_barrier.insert(this); + _texture_fetch_barrier_counter = _glgsg->_texture_fetch_barrier_counter; + _shader_image_write_barrier_counter = _glgsg->_shader_image_access_barrier_counter; + _texture_write_barrier_counter = _glgsg->_shader_image_access_barrier_counter; + _framebuffer_write_barrier_counter = _glgsg->_framebuffer_barrier_counter; } // We could still write to it before we read from it, so we have to always - // insert these barriers. This could be slightly optimized so that we don't - // issue a barrier between consecutive image reads, but that may not be - // worth the trouble. - _glgsg->_textures_needing_image_access_barrier.insert(this); - _glgsg->_textures_needing_update_barrier.insert(this); - _glgsg->_textures_needing_framebuffer_barrier.insert(this); + // insert these barriers. + _shader_image_read_barrier_counter = _glgsg->_shader_image_access_barrier_counter; + _texture_read_barrier_counter = _glgsg->_texture_update_barrier_counter; + _framebuffer_read_barrier_counter = _glgsg->_framebuffer_barrier_counter; } #endif // !OPENGLES_1 + +/** + * Returns a PBO with the given size to the pool of unused PBOs. + */ +void CLP(TextureContext):: +return_pbo(GLuint pbo, size_t size) { + // Also triggers when the number of buffers is -1 (which effectively means + // to always delete the buffers after use). + if (_num_pbos > get_texture()->get_num_async_transfer_buffers() || + size < _pbo_size) { + // We have too many PBOs, or this PBO is no longer of the proper + // size, so delete it rather than returning it to the pool. + _num_pbos--; + _glgsg->_glDeleteBuffers(1, &pbo); + } else { + _unused_pbos.push_front(pbo); + } +} + +/** + * Deletes all unused PBOs. + */ +void CLP(TextureContext):: +delete_unused_pbos() { + if (!_unused_pbos.empty()) { + for (GLuint pbo : _unused_pbos) { + _glgsg->_glDeleteBuffers(1, &pbo); + } + _num_pbos -= (int)_unused_pbos.size(); + _unused_pbos.clear(); + } +} + +/** + * Waits for all uploads to be finished. 
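+ * Repeatedly services the GSG's pending job queue on the calling thread until
+ * no uploads remain outstanding.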
+ */ +void CLP(TextureContext):: +do_wait_pending_uploads() const { + PStatTimer timer(_wait_async_texture_uploads_pcollector); + do { + _glgsg->process_pending_jobs(true); + } + while (is_upload_pending()); +} + +/** + * + */ +void CLP(TextureContext):: +do_wait_for_unused_pbo(int limit) const { + PStatTimer timer(_wait_async_texture_uploads_pcollector); + do { + _glgsg->process_pending_jobs(true); + } + while (_unused_pbos.empty() && _num_pbos >= limit); +} diff --git a/panda/src/glstuff/glTextureContext_src.h b/panda/src/glstuff/glTextureContext_src.h index c4244884714..03626a46ce7 100644 --- a/panda/src/glstuff/glTextureContext_src.h +++ b/panda/src/glstuff/glTextureContext_src.h @@ -41,12 +41,24 @@ class EXPCL_GL CLP(TextureContext) : public TextureContext { INLINE GLuint get_view_buffer(int view) const; #ifdef OPENGLES_1 - static constexpr bool needs_barrier(GLbitfield barrier) { return false; }; + static constexpr bool needs_barrier(GLbitfield barrier, bool writing) { return false; }; #else - bool needs_barrier(GLbitfield barrier); + bool needs_barrier(GLbitfield barrier, bool writing); void mark_incoherent(bool wrote); #endif + INLINE bool is_upload_pending() const; + INLINE void wait_pending_uploads() const; + INLINE void cancel_pending_uploads(); + + void return_pbo(GLuint pbo, size_t size); + void delete_unused_pbos(); + INLINE void wait_for_unused_pbo(int limit) const; + +private: + void do_wait_pending_uploads() const; + void do_wait_for_unused_pbo(int limit) const; + private: // This is the GL "name" of the texture object. GLuint _index; @@ -76,8 +88,25 @@ class EXPCL_GL CLP(TextureContext) : public TextureContext { GLenum _target; SamplerState _active_sampler; + // These counters are used to prevent out-of-order updates. + int _uploads_started = 0; + int _uploads_finished = 0; + int _uploads_pending = 0; + pdeque _unused_pbos; + int _num_pbos = 0; + size_t _pbo_size = 0; + CLP(GraphicsStateGuardian) *_glgsg; + // These are set to the equivalent counter in glgsg when a write is performed. + int _texture_fetch_barrier_counter = -1; + int _shader_image_read_barrier_counter = -1; + int _shader_image_write_barrier_counter = -1; + int _texture_read_barrier_counter = -1; + int _texture_write_barrier_counter = -1; + int _framebuffer_read_barrier_counter = -1; + int _framebuffer_write_barrier_counter = -1; + public: static TypeHandle get_class_type() { return _type_handle; diff --git a/panda/src/glstuff/glmisc_src.cxx b/panda/src/glstuff/glmisc_src.cxx index f223ef266e7..d1ae931c7ce 100644 --- a/panda/src/glstuff/glmisc_src.cxx +++ b/panda/src/glstuff/glmisc_src.cxx @@ -22,6 +22,11 @@ ConfigVariableBool gl_forward_compatible PRC_DESC("Setting this to true will request a forward-compatible OpenGL " "context, which will not support the fixed-function pipeline.")); +ConfigVariableBool gl_support_dsa + ("gl-support-dsa", true, + PRC_DESC("Configure this false if you suspect your GL's implementation of " + "Direct State Access is broken.")); + ConfigVariableBool gl_support_fbo ("gl-support-fbo", true, PRC_DESC("Configure this false if your GL's implementation of " @@ -321,6 +326,19 @@ ConfigVariableBool gl_depth_zero_to_one "range from 0 to 1, matching other graphics APIs. 
This setting " "requires OpenGL 4.5, or NVIDIA GeForce 8+ hardware.")); +ConfigVariableInt gl_texture_transfer_num_threads + ("gl-texture-transfer-num-threads", 2, + PRC_DESC("The number of threads that will be started to upload and download " + "texture data asynchronously, either via the setup_async_transfer " + "interface on the the Texture class or via the async screenshot " + "interface.")); + +ConfigVariableEnum gl_texture_transfer_thread_priority + ("gl-texture-transfer-thread-priority", TP_normal, + PRC_DESC("The default thread priority to assign to the threads created for " + "asynchronous texture transfers. The default is 'normal'; you may " + "also specify 'low', 'high', or 'urgent'.")); + extern ConfigVariableBool gl_parallel_arrays; void CLP(init_classes)() { diff --git a/panda/src/glstuff/glmisc_src.h b/panda/src/glstuff/glmisc_src.h index 1cc96726fe5..deb219dc5a9 100644 --- a/panda/src/glstuff/glmisc_src.h +++ b/panda/src/glstuff/glmisc_src.h @@ -17,6 +17,7 @@ #include "configVariableEnum.h" #include "geomEnums.h" #include "coordinateSystem.h" +#include "threadPriority.h" // Define some macros to transparently map to the double or float versions of // the OpenGL function names. @@ -35,6 +36,7 @@ extern EXPCL_GL ConfigVariableInt gl_version; extern EXPCL_GL ConfigVariableBool gl_forward_compatible; extern EXPCL_GL ConfigVariableBool gl_support_fbo; +extern ConfigVariableBool gl_support_dsa; extern ConfigVariableBool gl_cheap_textures; extern ConfigVariableBool gl_ignore_clamp; extern ConfigVariableBool gl_support_clamp_to_border; @@ -75,6 +77,8 @@ extern ConfigVariableBool gl_support_shadow_filter; extern ConfigVariableBool gl_support_vertex_array_bgra; extern ConfigVariableBool gl_force_image_bindings_writeonly; extern ConfigVariableEnum gl_coordinate_system; +extern ConfigVariableInt gl_texture_transfer_num_threads; +extern ConfigVariableEnum gl_texture_transfer_thread_priority; extern EXPCL_GL void CLP(init_classes)(); diff --git a/panda/src/gobj/preparedGraphicsObjects.cxx b/panda/src/gobj/preparedGraphicsObjects.cxx index b76b0524765..3089537dd2c 100644 --- a/panda/src/gobj/preparedGraphicsObjects.cxx +++ b/panda/src/gobj/preparedGraphicsObjects.cxx @@ -1515,9 +1515,24 @@ begin_frame(GraphicsStateGuardianBase *gsg, Thread *current_thread) { Texture *tex = qti->first; TextureContext *tc = tex->prepare_now(this, gsg); if (tc != nullptr) { - gsg->update_texture(tc, true); - if (qti->second != nullptr) { - qti->second->set_result(tc); + if (tex->get_num_async_transfer_buffers() == 0) { + gsg->update_texture(tc, true); + if (qti->second != nullptr) { + qti->second->set_result(tc); + } + } else { + // Async update + CompletionToken token; + if (qti->second != nullptr) { + token = [tc, fut = std::move(qti->second)] (bool success) { + if (success) { + fut->set_result(tc); + } else { + fut->notify_removed(); + } + }; + } + gsg->update_texture(tc, false, std::move(token)); } } } diff --git a/panda/src/gobj/texture.I b/panda/src/gobj/texture.I index 0d349f6653b..d304bfbfed4 100644 --- a/panda/src/gobj/texture.I +++ b/panda/src/gobj/texture.I @@ -2139,6 +2139,14 @@ rescale_texture() { return do_rescale_texture(cdata); } +/** + * Returns the number previously passed to setup_async_transfer(). + */ +INLINE int Texture:: +get_num_async_transfer_buffers() const { + return _num_async_transfer_buffers.load(std::memory_order_relaxed); +} + /** * Works like adjust_size, but also considers the texture class. 
Movie * textures, for instance, always pad outwards, regardless of textures- diff --git a/panda/src/gobj/texture.cxx b/panda/src/gobj/texture.cxx index 2afe0f03205..b8b1f5bbf15 100644 --- a/panda/src/gobj/texture.cxx +++ b/panda/src/gobj/texture.cxx @@ -1570,6 +1570,27 @@ get_view_modified_pages(UpdateSeq since, int view, int n) const { return result; } +/** + * Sets the number of buffers for asynchronous upload of texture data. If this + * number is higher than 0, future texture uploads will occur in the background, + * up to the provided amount at a time. The asynchronous upload will be + * triggered by calls to prepare() or when the texture comes into view and + * allow-incomplete-render is true. + * + * Each buffer is only large enough to contain a single view, so you may wish + * to create twice as many buffers if you want to update twice as many views. + * + * You can also pass the special value -1, which means to create as many + * buffers as is necessary for all asynchronous uploads to take place, and they + * will be deleted afterwards automatically. + * + * This setting will take effect immediately. + */ +void Texture:: +setup_async_transfer(int num_buffers) { + _num_async_transfer_buffers.store(num_buffers); +} + /** * Indicates that the texture should be enqueued to be prepared in the * indicated prepared_objects at the beginning of the next frame. This will @@ -5704,7 +5725,14 @@ do_modify_ram_image(CData *cdata) { } else { do_clear_ram_mipmap_images(cdata); } - return cdata->_ram_images[0]._image; + PTA_uchar data = cdata->_ram_images[0]._image; + if (data.get_node_ref_count() > 0) { + // Copy on write, if an upload thread is reading this now. + PTA_uchar new_data = PTA_uchar::empty_array(0); + new_data.v() = data.v(); + data.swap(new_data); + } + return data; } /** @@ -5779,7 +5807,15 @@ do_modify_ram_mipmap_image(CData *cdata, int n) { cdata->_ram_images[n]._image.empty()) { do_make_ram_mipmap_image(cdata, n); } - return cdata->_ram_images[n]._image; + + PTA_uchar data = cdata->_ram_images[n]._image; + if (data.get_node_ref_count() > 0) { + // Copy on write, if an upload thread is reading this now. 
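+    // (The asynchronous upload path node_ref()s the image while copying it, so a
+    // nonzero node reference count means a transfer thread may still be reading
+    // the old data; hand the caller a fresh copy instead.)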
+ PTA_uchar new_data = PTA_uchar::empty_array(0); + new_data.v() = data.v(); + data.swap(new_data); + } + return data; } /** diff --git a/panda/src/gobj/texture.h b/panda/src/gobj/texture.h index f6ec5eb1f2a..4ba2b58169b 100644 --- a/panda/src/gobj/texture.h +++ b/panda/src/gobj/texture.h @@ -47,6 +47,7 @@ #include "pfmFile.h" #include "asyncTask.h" #include "extension.h" +#include "patomic.h" class TextureContext; class FactoryParams; @@ -536,6 +537,8 @@ class EXPCL_PANDA_GOBJ Texture : public TypedWritableReferenceCount, public Nama MAKE_PROPERTY(auto_texture_scale, get_auto_texture_scale, set_auto_texture_scale); + void setup_async_transfer(int num_buffers); + PT(AsyncFuture) prepare(PreparedGraphicsObjects *prepared_objects); bool is_prepared(PreparedGraphicsObjects *prepared_objects) const; bool was_image_modified(PreparedGraphicsObjects *prepared_objects) const; @@ -628,6 +631,7 @@ class EXPCL_PANDA_GOBJ Texture : public TypedWritableReferenceCount, public Nama public: void texture_uploaded(); + INLINE int get_num_async_transfer_buffers() const; virtual bool has_cull_callback() const; virtual bool cull_callback(CullTraverser *trav, const CullTraverserData &data) const; @@ -1072,6 +1076,8 @@ class EXPCL_PANDA_GOBJ Texture : public TypedWritableReferenceCount, public Nama typedef pmap Contexts; Contexts _contexts; + patomic_signed_lock_free _num_async_transfer_buffers { 0 }; + // It is common, when using normal maps, specular maps, gloss maps, and // such, to use a file naming convention where the filenames of the special // maps are derived by concatenating a suffix to the name of the diffuse diff --git a/panda/src/gsgbase/graphicsStateGuardianBase.h b/panda/src/gsgbase/graphicsStateGuardianBase.h index 80d31f5c3c4..d49f144046f 100644 --- a/panda/src/gsgbase/graphicsStateGuardianBase.h +++ b/panda/src/gsgbase/graphicsStateGuardianBase.h @@ -22,6 +22,7 @@ #include "lightMutex.h" #include "patomic.h" #include "small_vector.h" +#include "completionToken.h" // A handful of forward references. @@ -149,6 +150,7 @@ class EXPCL_PANDA_GSGBASE GraphicsStateGuardianBase : public TypedWritableRefere virtual TextureContext *prepare_texture(Texture *tex)=0; virtual bool update_texture(TextureContext *tc, bool force)=0; + virtual bool update_texture(TextureContext *tc, bool force, CompletionToken token)=0; virtual void release_texture(TextureContext *tc)=0; virtual void release_textures(const pvector &contexts)=0; virtual bool extract_texture_data(Texture *tex)=0; diff --git a/panda/src/putil/CMakeLists.txt b/panda/src/putil/CMakeLists.txt index ecc14b9843c..c99dfac3837 100644 --- a/panda/src/putil/CMakeLists.txt +++ b/panda/src/putil/CMakeLists.txt @@ -20,6 +20,9 @@ set(P3PUTIL_HEADERS clockObject.h clockObject.I collideMask.h colorSpace.h + completable.I completable.h + completionCounter.I completionCounter.h + completionToken.I completionToken.h copyOnWriteObject.h copyOnWriteObject.I copyOnWritePointer.h copyOnWritePointer.I compareTo.I compareTo.h @@ -86,6 +89,7 @@ set(P3PUTIL_SOURCES callbackObject.cxx clockObject.cxx colorSpace.cxx + completionCounter.cxx copyOnWriteObject.cxx copyOnWritePointer.cxx config_putil.cxx configurable.cxx diff --git a/panda/src/putil/completable.I b/panda/src/putil/completable.I new file mode 100644 index 00000000000..96d140be33c --- /dev/null +++ b/panda/src/putil/completable.I @@ -0,0 +1,75 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. 
+ * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completable.I + * @author rdb + * @date 2025-01-22 + */ + +#ifndef CPPPARSER +/** + * + */ +template<class Callable> +INLINE Completable:: +Completable(Callable callback) : + _data(new LambdaData<Callable>(std::move(callback), [](Data *data, bool do_run) { + LambdaData<Callable> *self = (LambdaData<Callable> *)data; + if (do_run) { + std::move(self->_lambda)(); + } + delete self; + })) { +} +#endif + +/** + * + */ +INLINE Completable:: +Completable(Completable &&from) noexcept : + _data(from._data) { + from._data = nullptr; +} + +/** + * + */ +INLINE Completable &Completable:: +operator =(Completable &&from) { + Data *data = _data; + _data = from._data; + from._data = nullptr; + if (data != nullptr) { + data->_function.load(std::memory_order_relaxed)(data, false); + } + return *this; +} + +/** + * + */ +INLINE Completable:: +~Completable() { + Data *data = _data; + if (data != nullptr) { + data->_function.load(std::memory_order_relaxed)(data, false); + } +} + +/** + * + */ +INLINE void Completable:: +operator ()() { + Data *data = _data; + _data = nullptr; + if (data != nullptr) { + data->_function.load(std::memory_order_relaxed)(data, true); + } +} diff --git a/panda/src/putil/completable.h b/panda/src/putil/completable.h new file mode 100644 index 00000000000..9b0f6fdd129 --- /dev/null +++ b/panda/src/putil/completable.h @@ -0,0 +1,82 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. + * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completable.h + * @author rdb + * @date 2025-01-22 + */ + +#ifndef COMPLETABLE_H +#define COMPLETABLE_H + +#include "pandabase.h" +#include "patomic.h" + +/** + * Stores a type-erased callable that is move-only. May only be called once. + */ +class EXPCL_PANDA_PUTIL Completable { +public: + constexpr Completable() = default; + +#ifndef CPPPARSER + template<class Callable> + INLINE Completable(Callable callback); +#endif + + INLINE Completable(const Completable &copy) = delete; + INLINE Completable(Completable &&from) noexcept; + + INLINE Completable &operator =(const Completable &copy) = delete; + INLINE Completable &operator =(Completable &&from); + + INLINE void operator ()(); + + INLINE ~Completable(); + +protected: + // There are several design approaches here: + // 1. Optimize for no data block: do not require dynamic allocation of a data + // block in the simple case where the callback data is only the size of a + // single pointer. Store two pointers, one function pointer and a data + // pointer(-sized storage), directly on the class here. + // 2. Optimize for a data block: store the function pointer on the data block, + // always requiring dynamic allocation. + // + // Right now I have opted for 2 because it allows the function pointer to be + // dynamically swapped (used in CompletionCounter), but this decision may + // change in the future. + + struct Data; + typedef void CallbackFunction(Data *, bool); + + struct Data { + patomic<CallbackFunction *> _function { nullptr }; + }; + + template<class Lambda> + struct LambdaData : public Data { + // Must unfortunately be defined inline, since this struct is protected.
+ LambdaData(Lambda lambda, CallbackFunction *function) : + _lambda(std::move(lambda)) { + _function = function; + } + + Lambda _lambda; + }; + + Data *_data = nullptr; + + friend class AsyncFuture; + friend class CompletionCounter; + friend class CompletionToken; +}; + +#include "completable.I" + +#endif diff --git a/panda/src/putil/completionCounter.I b/panda/src/putil/completionCounter.I new file mode 100644 index 00000000000..6d591433607 --- /dev/null +++ b/panda/src/putil/completionCounter.I @@ -0,0 +1,97 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. + * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completionCounter.I + * @author rdb + * @date 2025-01-22 + */ + +/** + * + */ +INLINE CompletionCounter:: +~CompletionCounter() { + CounterData *data = _data; + if (data != nullptr) { + // then() is not called; we still need something that destructs the data + // when done. + auto prev_function = data->_function.exchange(&abandon_callback, std::memory_order_relaxed); + if (prev_function == nullptr) { + // Was already done. + delete data; + } + } +} + +/** + * Returns a new token. May not be called after then(). + */ +INLINE CompletionToken CompletionCounter:: +make_token() { + CompletionToken token; + if (_data == nullptr) { + _data = new CounterData; + _data->_function = &initial_callback; + } + auto old_value = _data->_counter.fetch_add(1); + nassertr(old_value >= 0, token); + token._callback._data = _data; + return token; +} + +/** + * Registers the given callback to run upon completion. If the counter is + * already done, runs it immediately. This requires an rvalue because it + * consumes the counter; use std::move() if you don't have an rvalue. + * + * The callback will either be called immediately or directly when the last + * token calls complete(); however, it may also be called if a token is + * destroyed. This may happen at unexpected times, such as when the lambda + * holding the token is destroyed prematurely. In this case, however, the + * passed success argument will always be false. + */ +template<class Callable> +INLINE void CompletionCounter:: +then(Callable callable) && { + // Replace the callback pointer with something that calls the given callable + // once the count reaches 0. + CounterData *data = _data; + nassertv(data != nullptr); + _data = nullptr; + if (data->_function.load(std::memory_order_acquire) == nullptr) { + // Already done. + callable((data->_counter.load(std::memory_order_relaxed) & ~0xffff) == 0); + delete data; + return; + } + + static_assert(sizeof(Callable) <= sizeof(data->_storage), + "raise storage size in completionCounter.h or reduce lambda captures"); + + new (data->_storage) Callable(std::move(callable)); + + Completable::CallbackFunction *new_function = + [] (Completable::Data *data_ptr, bool success) { + CounterData *data = (CounterData *)data_ptr; + auto prev_count = data->_counter.fetch_add((success ? 0 : 0x10000) - 1, std::memory_order_release); + if ((short)(prev_count & 0xffff) > 1) { + return; + } + + Callable *callable = (Callable *)data->_storage; + std::move(*callable)(success && (prev_count & ~0xffff) == 0); + callable->~Callable(); + delete data; + }; + + auto prev_function = data->_function.exchange(new_function, std::memory_order_acq_rel); + if (UNLIKELY(prev_function == nullptr)) { + // Last token finished in the meantime.
+ new_function(data, (data->_counter.load(std::memory_order_relaxed) & ~0xffff) == 0); + } +} diff --git a/panda/src/putil/completionCounter.cxx b/panda/src/putil/completionCounter.cxx new file mode 100644 index 00000000000..2540867e61f --- /dev/null +++ b/panda/src/putil/completionCounter.cxx @@ -0,0 +1,52 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. + * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completionCounter.cxx + * @author rdb + * @date 2025-01-24 + */ + +#include "completionCounter.h" + +/** + * Called when a token is completed before then() is called. + */ +void CompletionCounter:: +initial_callback(Completable::Data *data_ptr, bool success) { + CounterData &data = *(CounterData *)data_ptr; + auto prev_count = data._counter.fetch_add((success ? 0 : 0x10000) - 1, std::memory_order_release); + if ((prev_count & 0xffff) == 1) { + // We're done early. + auto prev_callback = data._function.exchange(nullptr, std::memory_order_acq_rel); + nassertv(prev_callback != nullptr); + + // Someone called then() in the meantime. Call the new callback. The + // counter will drop below 0 when that's called, but the new callback is + // designed to handle that. + if (prev_callback != &initial_callback) { + prev_callback(data_ptr, success && (prev_count & ~0xffff) == 0); + } + } +} + +/** + * Called when a token is completed after this object is destroyed without + * then() being called. + */ +void CompletionCounter:: +abandon_callback(Completable::Data *data_ptr, bool success) { + CounterData &data = *(CounterData *)data_ptr; + auto prev_count = data._counter.fetch_sub(1, std::memory_order_relaxed); + if ((prev_count & 0xffff) <= 1) { + // Done. + auto prev_callback = data._function.exchange(nullptr, std::memory_order_relaxed); + nassertv(prev_callback != nullptr); + nassertv(prev_callback == &abandon_callback); + delete &data; + } +} diff --git a/panda/src/putil/completionCounter.h b/panda/src/putil/completionCounter.h new file mode 100644 index 00000000000..dbb0e2dcfb4 --- /dev/null +++ b/panda/src/putil/completionCounter.h @@ -0,0 +1,58 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. + * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completionCounter.h + * @author rdb + * @date 2025-01-22 + */ + +#ifndef COMPLETIONCOUNTER_H +#define COMPLETIONCOUNTER_H + +#include "pandabase.h" +#include "completionToken.h" + +#include + +/** + * Shared counter that generates "completion tokens", each of which increments + * the counter and decrements it again once it is finished. After the tokens + * are handed out, a callback may be registered using then(), which will be + * called as soon as the last token is done.
+ */ +class EXPCL_PANDA_PUTIL CompletionCounter { +public: + constexpr CompletionCounter() = default; + CompletionCounter(const CompletionCounter &copy) = delete; + + INLINE ~CompletionCounter(); + + INLINE CompletionToken make_token(); + + template<class Callable> + INLINE void then(Callable callable) &&; + +private: + static void initial_callback(Completable::Data *data, bool success); + static void abandon_callback(Completable::Data *data, bool success); + +protected: + struct CounterData : public Completable::Data { + // Least significant half is counter, most significant half is error count + patomic_signed_lock_free _counter { 0 }; + + // Just raise this if the static_assert fires (or limit the size of your + // lambda captures). + alignas(std::max_align_t) unsigned char _storage[64]; + }; + CounterData *_data = nullptr; +}; + +#include "completionCounter.I" + +#endif diff --git a/panda/src/putil/completionToken.I b/panda/src/putil/completionToken.I new file mode 100644 index 00000000000..aef06a7d4eb --- /dev/null +++ b/panda/src/putil/completionToken.I @@ -0,0 +1,42 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. + * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completionToken.I + * @author rdb + * @date 2025-01-22 + */ + +#ifndef CPPPARSER +/** + * Creates a token that calls the given callback when it's done, passing it + * true on success and false on failure or abandonment. + */ +template<class Callable> +INLINE CompletionToken:: +CompletionToken(Callable callback) { + // The main difference from a Completable is that this will always call the + // callback, even on failure, so that cleanup can be done. + _callback._data = new Completable::LambdaData<Callable>(std::move(callback), [](Completable::Data *data, bool success) { + Completable::LambdaData<Callable> *self = (Completable::LambdaData<Callable> *)data; + std::move(self->_lambda)(success); + delete self; + }); +} +#endif + +/** + * + */ +INLINE void CompletionToken:: +complete(bool success) { + Completable::Data *data = _callback._data; + if (data != nullptr) { + _callback._data = nullptr; + data->_function.load(std::memory_order_relaxed)(data, success); + } +} diff --git a/panda/src/putil/completionToken.h b/panda/src/putil/completionToken.h new file mode 100644 index 00000000000..b73f4fe376b --- /dev/null +++ b/panda/src/putil/completionToken.h @@ -0,0 +1,56 @@ +/** + * PANDA 3D SOFTWARE + * Copyright (c) Carnegie Mellon University. All rights reserved. + * + * All use of this software is subject to the terms of the revised BSD + * license. You should have received a copy of this license along + * with this source code in a file named "LICENSE." + * + * @file completionToken.h + * @author rdb + * @date 2025-01-22 + */ + +#ifndef COMPLETIONTOKEN_H +#define COMPLETIONTOKEN_H + +#include "pandabase.h" +#include "pnotify.h" +#include "completable.h" + +/** + * A completion token can be created from a callback, future or + * CompletionCounter and can be passed into an asynchronous operation in order + * to receive a signal when it is done. + * + * The asynchronous operation should call complete() on it when it is done, + * with a boolean value indicating success or failure. If the token is + * destroyed prematurely, it is treated as if it called complete(false).
+ * + * This should be preferred over passing an AsyncFuture into a method since + * a CompletionToken provides both more flexibility in use (due to accepting + * an arbitrary callback) and more safety (since the RAII semantics guarantees + * that the callback is never silently dropped). + * + * The token may only be moved, not copied. + */ +class EXPCL_PANDA_PUTIL CompletionToken { +public: + constexpr CompletionToken() = default; + +#ifndef CPPPARSER + template<class Callable> + INLINE CompletionToken(Callable callback); +#endif + + void complete(bool success); + +protected: + Completable _callback; + + friend class CompletionCounter; +}; + +#include "completionToken.I" + +#endif diff --git a/panda/src/putil/p3putil_composite1.cxx b/panda/src/putil/p3putil_composite1.cxx index f78459eff71..c464d7708c9 100644 --- a/panda/src/putil/p3putil_composite1.cxx +++ b/panda/src/putil/p3putil_composite1.cxx @@ -17,6 +17,7 @@ #include "callbackObject.cxx" #include "clockObject.cxx" #include "colorSpace.cxx" +#include "completionCounter.cxx" #include "config_putil.cxx" #include "configurable.cxx" #include "copyOnWriteObject.cxx"
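
For illustration only, not part of the patch: a minimal sketch of how an application might opt a texture into the asynchronous upload path added in texture.cxx above. The helper name, the texture path and the buffer count of 2 are placeholder assumptions; only setup_async_transfer() and prepare() come from this change.

// Hypothetical usage sketch; assumes a window and its GSG have already been
// opened through the usual framework setup.
#include "texturePool.h"
#include "graphicsStateGuardian.h"

void enable_async_upload(GraphicsStateGuardian *gsg) {
  Texture *tex = TexturePool::load_texture("maps/streamed_texture.png");
  if (tex == nullptr) {
    return;
  }
  // Allow up to two transfer buffers to be in flight for this texture.
  // Passing -1 instead would allocate as many temporary buffers as needed.
  tex->setup_async_transfer(2);
  // With async transfer enabled, prepare() schedules the upload in the
  // background instead of stalling the draw thread on first use.
  tex->prepare(gsg->get_prepared_objects());
}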
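
Likewise, a hedged sketch of the one-shot semantics of the new Completable class: the stored lambda runs at most once, ownership transfers on move, and an instance that is never invoked simply releases its data on destruction. The variable names are made up for the example.

#include "completable.h"
#include <iostream>
#include <utility>

int main() {
  Completable done([] { std::cout << "finished\n"; });
  Completable moved = std::move(done);  // ownership transfers; 'done' is now empty
  moved();  // runs the stored lambda and releases the data block
  moved();  // safe no-op: the callback has already run
  return 0;
}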
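
A sketch of how a caller might hand a CompletionToken to an asynchronous operation and get notified either way. The do_async_work() helper is hypothetical; an actual consumer in this patch is the new update_texture() overload on GraphicsStateGuardianBase.

#include "completionToken.h"
#include <iostream>
#include <utility>

// Stand-in for any API that accepts a CompletionToken; a real implementation
// would queue work and call complete() when it finishes.
static void do_async_work(CompletionToken token) {
  token.complete(true);
}

int main() {
  CompletionToken token([](bool success) {
    std::cout << (success ? "operation succeeded" : "operation failed or was abandoned") << "\n";
  });
  do_async_work(std::move(token));
  return 0;
}

If do_async_work() dropped the token without calling complete(), the callback would still fire with success set to false, which is the safety property the header comment describes.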
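
Finally, a sketch of combining several tokens through a CompletionCounter; start_upload() is again a hypothetical stand-in for any operation that accepts a token.

#include "completionCounter.h"
#include <iostream>
#include <utility>

static void start_upload(CompletionToken token) {
  token.complete(true);  // pretend the upload finished successfully
}

int main() {
  CompletionCounter counter;
  start_upload(counter.make_token());
  start_upload(counter.make_token());

  // then() consumes the counter, hence the std::move(); the callback fires
  // when the last outstanding token completes (or immediately, as here).
  std::move(counter).then([](bool all_ok) {
    std::cout << "all uploads done, success=" << all_ok << "\n";
  });
  return 0;
}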