diff --git a/README.md b/README.md index 19523e3..9e8ea15 100644 --- a/README.md +++ b/README.md @@ -78,10 +78,14 @@ python main.py --dataset ispd2015_fix --design_name mgc_fft_1 --load_from_raw Tr python main.py --dataset ispd2015_fix --run_all True --load_from_raw True --detail_placement True ``` -- To run Routability GP + DP flow for ISPD2015 dataset: +- To run Routability GP + DP flow for ISPD2015/2018/2019 dataset: ```bash -# run all the designs in ispd2015 with routability optimization +# run all the designs with routability optimization python main.py --dataset ispd2015_fix --run_all True --load_from_raw True --detail_placement True --use_cell_inflate True + +python main.py --dataset ispd2018 --run_all True --load_from_raw True --detail_placement True --use_cell_inflate True + +python main.py --dataset ispd2019_no_fence --run_all True --load_from_raw True --detail_placement True --use_cell_inflate True ``` **NOTE**: We default enable the deterministic mode. If you don't need determinism and want to run placement in an extremely fast mode, please try to set `--deterministic False` in the Python arguments. 
diff --git a/cpp_to_py/common/db/Cell.cpp b/cpp_to_py/common/db/Cell.cpp index 48e5b20..360cc9a 100644 --- a/cpp_to_py/common/db/Cell.cpp +++ b/cpp_to_py/common/db/Cell.cpp @@ -37,19 +37,8 @@ void Cell::ctype(CellType* t) { int Cell::lx() const { return _lx; } int Cell::ly() const { return _ly; } -bool Cell::flipX() const { return _flipX; } -bool Cell::flipY() const { return _flipY; } int Cell::orient() const { - if (!flipX() && !flipY()) { - return 0; // N - } else if (flipX() && flipY()) { - return 2; // S - } else if (flipX() && !flipY()) { - return 4; // FN - } else if (!flipX() && flipY()) { - return 6; // FS - } - return 0; + return _orient; } bool Cell::placed() const { return (lx() != INT_MIN) && (ly() != INT_MIN); } @@ -70,38 +59,7 @@ void Cell::place(int x, int y, int orient) { } _lx = x; _ly = y; - switch (orient) { - case 0: - _flipX = false; - _flipY = false; - break; - case 2: - _flipX = true; - _flipY = true; - break; - case 4: - _flipX = true; - _flipY = false; - break; - case 6: - _flipX = false; - _flipY = true; - break; - default: - _flipX = false; - _flipY = false; - break; - } -} - -void Cell::place(int x, int y, bool flipX, bool flipY) { - if (_fixed) { - logger.warning("moving fixed cell %s to (%d,%d)", _name.c_str(), x, y); - } - _lx = x; - _ly = y; - _flipX = flipX; - _flipY = flipY; + _orient = orient; } void Cell::unplace() { @@ -109,7 +67,7 @@ void Cell::unplace() { logger.warning("unplace fixed cell %s", _name.c_str()); } _lx = _ly = INT_MIN; - _flipX = _flipY = false; + _orient = -1; } /***** Cell Type *****/ diff --git a/cpp_to_py/common/db/Cell.h b/cpp_to_py/common/db/Cell.h index bcd51fb..9045b8c 100644 --- a/cpp_to_py/common/db/Cell.h +++ b/cpp_to_py/common/db/Cell.h @@ -14,19 +14,19 @@ class CellType { char _topPower = 'x'; vector _obs; - // _nonRegularRects.size() > 0 implies that this cell is a fixed cell and - // its shape is a polygon. 
Each rectangle is also appended into + // _nonRegularRects.size() > 0 implies that this cell is a fixed cell and + // its shape is a polygon. Each rectangle is also appended into // Databased.placeBlockages during parsing the polygon-shape cell. // NOTE: We only support this feature for ICCAD/DAC 2012 benchmarks. - // When processing GPDatabase, we will set this kind of cells' width and + // When processing GPDatabase, we will set this kind of cells' width and // height as 0 and use placeement blockages to represent their shapes. - vector _nonRegularRects; + vector _nonRegularRects; int _libcell = -1; public: std::string name = ""; - char cls = 'x'; + std::string cls = "x"; bool stdcell = false; int width = 0; int height = 0; @@ -84,8 +84,7 @@ class Cell { int _lx = INT_MIN; int _ly = INT_MIN; - bool _flipX = false; - bool _flipY = false; + int _orient = -1; // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE public: bool highlighted = false; @@ -107,8 +106,6 @@ class Cell { int hy() const { return ly() + height(); } int cx() const { return lx() + width() / 2; } int cy() const { return ly() + height() / 2; } - bool flipX() const; - bool flipY() const; int orient() const; int width() const { return _type->width + _spaceL + _spaceR; } int height() const { return _type->height + _spaceB + _spaceT; } @@ -119,7 +116,6 @@ class Cell { bool placed() const; void place(int x, int y); void place(int x, int y, int orient); - void place(int x, int y, bool flipX, bool flipY); void unplace(); unsigned numPins() const { return _pins.size(); } diff --git a/cpp_to_py/common/db/Database.cpp b/cpp_to_py/common/db/Database.cpp index aa3df96..0782569 100644 --- a/cpp_to_py/common/db/Database.cpp +++ b/cpp_to_py/common/db/Database.cpp @@ -490,9 +490,11 @@ void Database::SetupRows() { // verify row flipping conflict bool flipCheckPass = true; std::vector flip(nSitesY, 0); + std::vector orients(nSitesY, -1); for (Row* row : rows) { char isFlip = (row->flip() ? 
1 : 2); int y = (row->y() - coreLY) / siteH; + orients[y] = row->orient(); // TODO: how to handle multiple ROW definitions on same y? if (flip[y] == 0) { flip[y] = isFlip; } else if (flip[y] != isFlip) { @@ -525,6 +527,7 @@ void Database::SetupRows() { rows[y]->xNum(nSitesX); rows[y]->yNum(1); rows[y]->flip(flip[y] == 1); + rows[y]->orient(orients[y]); } // set row power-rail @@ -722,10 +725,11 @@ Row* Database::addRow(const string& name, const int y, const unsigned xNum, const unsigned yNum, + const int orient, const bool flip, const unsigned xStep, const unsigned yStep) { - Row* newrow = new Row(name, macro, x, y, xNum, yNum, flip, xStep, yStep); + Row* newrow = new Row(name, macro, x, y, xNum, yNum, orient, flip, xStep, yStep); rows.push_back(newrow); return newrow; } diff --git a/cpp_to_py/common/db/Database.h b/cpp_to_py/common/db/Database.h index 82cf3f6..6392058 100644 --- a/cpp_to_py/common/db/Database.h +++ b/cpp_to_py/common/db/Database.h @@ -154,6 +154,7 @@ class Database { const int y, const unsigned xNum = 0, const unsigned yNum = 0, + const int orient = 0, const bool flip = false, const unsigned xStep = 0, const unsigned yStep = 0); diff --git a/cpp_to_py/common/db/Pin.h b/cpp_to_py/common/db/Pin.h index 29b8029..dfc4643 100644 --- a/cpp_to_py/common/db/Pin.h +++ b/cpp_to_py/common/db/Pin.h @@ -50,7 +50,7 @@ class IOPin { string name = ""; int x = INT_MIN; int y = INT_MIN; - int _orient = 0; // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE + int _orient = -1; // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE PinType* type; Pin* pin; int gpdb_id = -1; diff --git a/cpp_to_py/common/db/Row.h b/cpp_to_py/common/db/Row.h index 10f8f51..36e51d9 100644 --- a/cpp_to_py/common/db/Row.h +++ b/cpp_to_py/common/db/Row.h @@ -11,6 +11,7 @@ class Row { int _y = 0; unsigned _xNum = 0; unsigned _yNum = 0; + int _orient = 0; // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE bool _flip = false; unsigned _xStep = 0; unsigned _yStep = 0; @@ -20,13 +21,14 @@ class 
Row { public: std::vector segments; - Row(const string& name, const string& macro, const int x, const int y, const unsigned xNum = 0, const unsigned yNum = 0, const bool flip = false, const unsigned xStep = 0, const unsigned yStep = 0) + Row(const string& name, const string& macro, const int x, const int y, const unsigned xNum = 0, const unsigned yNum = 0, const int orient = 0, const bool flip = false, const unsigned xStep = 0, const unsigned yStep = 0) : _name(name) , _macro(macro) , _x(x) , _y(y) , _xNum(xNum) , _yNum(yNum) + , _orient(orient) , _flip(flip) , _xStep(xStep) , _yStep(yStep) {} @@ -46,6 +48,7 @@ class Row { int y() const { return _y; } unsigned xNum() const { return _xNum; } unsigned yNum() const { return _yNum; } + int orient() const { return _orient; } bool flip() const { return _flip; } unsigned xStep() const { return _xStep; } unsigned yStep() const { return _yStep; } @@ -56,6 +59,7 @@ class Row { void y(const int value) { _y = value; } void xNum(const unsigned value) { _xNum = value; } void yNum(const unsigned value) { _yNum = value; } + void orient(const int value) { _orient = value; } void flip(const bool value) { _flip = value; } void xStep(const unsigned value) { _xStep = value; } void yStep(const unsigned value) { _yStep = value; } diff --git a/cpp_to_py/common/io/file_bkshf_db.cpp b/cpp_to_py/common/io/file_bkshf_db.cpp index 0f34faa..ce38393 100644 --- a/cpp_to_py/common/io/file_bkshf_db.cpp +++ b/cpp_to_py/common/io/file_bkshf_db.cpp @@ -417,6 +417,7 @@ bool Database::readBSAux(const std::string& auxFile, const std::string& plFile) row->xNum(bsData.rowSites[i]); row->yNum(1); row->flip((i % 2) == 1); + row->orient((i % 2) * 6); // 0:N or 6:FS this->dieLX = std::min(this->dieLX, row->x()); this->dieLY = std::min(this->dieLY, row->y()); this->dieHX = std::max(this->dieHX, row->x() + (int)row->width()); @@ -575,7 +576,8 @@ bool Database::readBSAux(const std::string& auxFile, const std::string& plFile) } else { string 
celltypename(bsData.typeName[typeID]); Cell* cell = this->addCell(bsData.cellName[i], this->getCellType(celltypename)); - cell->place(bsData.cellX[i], bsData.cellY[i], false, false); + // In Bookshelf, we don't need to consider the cell orient, set it as -1 + cell->place(bsData.cellX[i], bsData.cellY[i], -1); // cout<x<<","<y<fixed((bsData.cellFixed[i] == (char)1)); } diff --git a/cpp_to_py/common/io/file_lefdef_db.cpp b/cpp_to_py/common/io/file_lefdef_db.cpp index 23d293c..df2a99c 100644 --- a/cpp_to_py/common/io/file_lefdef_db.cpp +++ b/cpp_to_py/common/io/file_lefdef_db.cpp @@ -22,6 +22,7 @@ bool isFlipX(int orient) { } return false; } + bool isFlipY(int orient) { switch (orient) { case 0: @@ -36,7 +37,7 @@ bool isFlipY(int orient) { return false; } -string getOrient(bool flipX, bool flipY) { +string getRowOrient(bool flipX, bool flipY) { if (flipX) { if (flipY) { return "S"; @@ -52,6 +53,32 @@ string getOrient(bool flipX, bool flipY) { } } +string getOrient(int orient) { + // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE + switch (orient) { + case 0: + return "N"; + case 1: + return "W"; + case 2: + return "S"; + case 3: + return "E"; + case 4: + return "FN"; + case 5: + return "FW"; + case 6: + return "FS"; + case 7: + return "FE"; + case -1: + return "NONE"; + default: + return "N"; + } +} + #define DIR_UP 1 #define DIR_DOWN 2 #define DIR_LEFT 4 @@ -360,16 +387,16 @@ bool Database::writeComponents(ofstream& ofs) { oss << " - " << cell->name() << " " << cell->ctype()->name << endl; // ofs << " - " << cell->name() << " " << cell->ctype()->name << endl; if (cell->fixed()) { - oss << " + FIXED ( " << cell->lx() << " " << cell->ly() << " ) " - << getOrient(cell->flipX(), cell->flipY()) << " ;" << endl; + oss << " + FIXED ( " << cell->lx() << " " << cell->ly() << " ) " << getOrient(cell->orient()) << " ;" + << endl; // ofs << " + FIXED ( " << cell->lx() << " " << cell->ly() << " ) " - // << getOrient(cell->flipX(), cell->flipY()) + // << 
getOrient(cell->orient()) // << " ;" << endl; } else if (cell->placed()) { - oss << " + PLACED ( " << cell->lx() << " " << cell->ly() << " ) " - << getOrient(cell->flipX(), cell->flipY()) << " ;" << endl; + oss << " + PLACED ( " << cell->lx() << " " << cell->ly() << " ) " << getOrient(cell->orient()) << " ;" + << endl; // ofs << " + PLACED ( " << cell->lx() << " " << cell->ly() << " ) " - // << getOrient(cell->flipX(), cell->flipY()) + // << getOrient(cell->orient()) // << " ;" << endl; } else { oss << " + UNPLACED ;" << endl; @@ -461,7 +488,7 @@ bool Database::writeDEF(const string& file) { for (Row* row : rows) { ofs << "ROW " << row->name() << ' ' << row->macro() << ' ' << row->x() << ' ' << row->y() << ' '; - ofs << getOrient(false, row->flip()); + ofs << getRowOrient(false, row->flip()); ofs << " DO " << row->xNum() << " BY " << row->yNum() << " STEP " << row->xStep() << ' ' << row->yStep() << " ;\n"; } @@ -1158,19 +1185,18 @@ int readLefMacro(lefrCallbackType_e c, lefiMacro* macro, lefiUserData ud) { celltype->height = round(macro->sizeY() * convertFactor); if (macro->lefiMacro::hasClass()) { - char clsname[64] = {0}; - strcpy(clsname, macro->macroClass()); - if (!strcmp(clsname, "CORE")) { - celltype->cls = 'c'; + std::string clsname(macro->macroClass()); + if (clsname == "CORE") { + celltype->cls = clsname; celltype->stdcell = true; - } else if (!strcmp(clsname, "BLOCK")) { - celltype->cls = 'b'; + } else if (clsname == "BLOCK") { + celltype->cls = clsname; } else { - celltype->cls = clsname[0]; - logger.warning("Class type is not defined: %s", clsname); + celltype->cls = clsname; + logger.warning("Class type is not defined: %s", celltype->cls.c_str()); } } else { - celltype->cls = 'c'; + celltype->cls = "CORE"; // default value } if (macro->lefiMacro::hasOrigin()) { @@ -1273,6 +1299,7 @@ int readDefRow(defrCallbackType_e c, defiRow* drow, defiUserData ud) { drow->y(), drow->xNum(), drow->yNum(), + drow->orient(), isFlipY(drow->orient()), drow->xStep(), 
drow->yStep()); @@ -1442,27 +1469,36 @@ int readDefComponentStart(defrCallbackType_e c, int num, defiUserData ud) { int readDefComponent(defrCallbackType_e c, defiComponent* co, defiUserData ud) { Database* db = (Database*)ud; - - Cell* cell = db->addCell(co->id(), db->getCellType(co->name())); + CellType* celltype = db->getCellType(co->name()); + Cell* cell = db->addCell(co->id(), celltype); if (co->isUnplaced()) { cell->fixed(false); cell->unplace(); } else if (co->isPlaced()) { - cell->place(co->placementX(), co->placementY(), isFlipX(co->placementOrient()), isFlipY(co->placementOrient())); - cell->fixed(false); + cell->place(co->placementX(), co->placementY(), co->placementOrient()); + if (celltype->cls == "CORE") { + cell->fixed(false); + } else { + // Set all non-CORE cells as fixed cells + cell->fixed(true); + } if (co->placementOrient() % 2 == 1) { - // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE - logger.warning( - "Cell [%s]'s placementOrient [%d] is not supported.", cell->name().c_str(), co->placementOrient()); + // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE + logger.warning("Cell [%s]'s placementOrient [%s] is not supported, CLASS: %s.", + cell->name().c_str(), + getOrient(co->placementOrient()).c_str(), + celltype->cls.c_str()); } } else if (co->isFixed()) { - cell->place(co->placementX(), co->placementY(), isFlipX(co->placementOrient()), isFlipY(co->placementOrient())); + cell->place(co->placementX(), co->placementY(), co->placementOrient()); cell->fixed(true); if (co->placementOrient() % 2 == 1) { - // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE - logger.warning( - "Cell [%s]'s placementOrient [%d] is not supported.", cell->name().c_str(), co->placementOrient()); + // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE + logger.warning("Fixed Cell [%s]'s placementOrient [%s] is not supported, CLASS: %s.", + cell->name().c_str(), + getOrient(co->placementOrient()).c_str(), + celltype->cls.c_str()); } } return 0; diff --git 
a/cpp_to_py/common/utils/log.h b/cpp_to_py/common/utils/log.h index 25d8285..4982e08 100644 --- a/cpp_to_py/common/utils/log.h +++ b/cpp_to_py/common/utils/log.h @@ -63,6 +63,7 @@ class PrintfLogger { static constexpr bool write_log = false; FILE* f; bool tmp_verbose_parser_log = false; + int _global_log_level = LOG_INFO; public: // void setup_logger(argparse::ArgumentParser parser); @@ -84,12 +85,16 @@ class PrintfLogger { verbose_parser_log = tmp_verbose_parser_log; } + void set_global_log_level(int value) { + _global_log_level = value; + } + template void log(int log_level, const char* format, Args&&... args) { if (!verbose_parser_log) { return; } - if (log_level >= GLOBAL_LOG_LEVEL) { + if (log_level >= _global_log_level) { std::string curr_log = tstamp.get_time_stamp(); if (log_level > LOG_INFO) { curr_log += log_level_ANSI_color(log_level); @@ -111,7 +116,7 @@ class PrintfLogger { if (!verbose_parser_log) { return; } - if (log_level >= GLOBAL_LOG_LEVEL) { + if (log_level >= _global_log_level) { std::string curr_log = tstamp.get_time_stamp(); if (log_level > LOG_INFO) { curr_log += log_level_ANSI_color(log_level); diff --git a/cpp_to_py/common/utils/log_level.h b/cpp_to_py/common/utils/log_level.h index d327bc1..89aa2bb 100644 --- a/cpp_to_py/common/utils/log_level.h +++ b/cpp_to_py/common/utils/log_level.h @@ -14,7 +14,6 @@ inline constexpr int LOG_WARN = 4; // 4 inline constexpr int LOG_ERROR = 5; // 5 inline constexpr int LOG_FATAL = 6; // 6 inline constexpr int LOG_OK = 7; // 7 -inline int GLOBAL_LOG_LEVEL = LOG_INFO; // change verbose level in Setting.h } // namespace utils::log_level diff --git a/cpp_to_py/gpudp/PyBindCppMain.cpp b/cpp_to_py/gpudp/PyBindCppMain.cpp index c13760e..0162fc7 100644 --- a/cpp_to_py/gpudp/PyBindCppMain.cpp +++ b/cpp_to_py/gpudp/PyBindCppMain.cpp @@ -26,6 +26,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { float, int, int, + int, float, float>()) .def("check", &dp::DPTorchRawDB::check) @@ -57,6 +58,7 @@ 
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { float xh_, float yl_, float yh_, + int num_conn_movable_nodes_, int num_movable_nodes_, int num_nodes_, float site_width_, @@ -79,6 +81,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { xh_, yl_, yh_, + num_conn_movable_nodes_, num_movable_nodes_, num_nodes_, site_width_, @@ -90,9 +93,10 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("abacusLegalization", [](std::shared_ptr at_db_ptr, int num_bins_x, int num_bins_y) { return dp::abacusLegalization(*at_db_ptr, num_bins_x, num_bins_y); }); - m.def("greedyLegalization", [](std::shared_ptr at_db_ptr, int num_bins_x, int num_bins_y) { - return dp::greedyLegalization(*at_db_ptr, num_bins_x, num_bins_y); - }); + m.def("greedyLegalization", + [](std::shared_ptr at_db_ptr, int num_bins_x, int num_bins_y, bool legalize_filler) { + return dp::greedyLegalization(*at_db_ptr, num_bins_x, num_bins_y, legalize_filler); + }); m.def("kReorder", [](std::shared_ptr at_db_ptr, int num_bins_x, int num_bins_y, int K, int max_iters) { return dp::kReorder(*at_db_ptr, num_bins_x, num_bins_y, K, max_iters); @@ -114,6 +118,8 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) { m.def("legalityCheck", [](std::shared_ptr at_db_ptr, float scale_factor) { return dp::legalityCheck(*at_db_ptr, scale_factor); }); + m.def("fillerLegalization", + [](std::shared_ptr at_db_ptr) { return dp::fillerLegalization(*at_db_ptr); }); } } // namespace Xplace diff --git a/cpp_to_py/gpudp/check/legality_check.cpp b/cpp_to_py/gpudp/check/legality_check.cpp index 3cc8542..4cd20a5 100644 --- a/cpp_to_py/gpudp/check/legality_check.cpp +++ b/cpp_to_py/gpudp/check/legality_check.cpp @@ -4,6 +4,8 @@ namespace dp { +float floatRound(float a, float prec) { return std::round(a / prec) * prec; } + bool boundaryCheck(const float* x, const float* y, const float* node_size_x, @@ -94,14 +96,15 @@ bool fenceRegionCheck(const float* x, const int* flat_region_boxes_start, const int* node2fence_region_map, int num_movable_nodes, - int 
num_regions) { + int num_regions, + float scale_factor) { bool legal_flag = true; // check fence regions for (int i = 0; i < num_movable_nodes; ++i) { float node_xl = x[i]; - float node_yl = y[i]; + float node_yl = floatRound(y[i], scale_factor); float node_xh = node_xl + node_size_x[i]; - float node_yh = node_yl + node_size_y[i]; + float node_yh = floatRound(node_yl + node_size_y[i], scale_factor); int region_id = node2fence_region_map[i]; if (region_id < num_regions) { @@ -112,10 +115,12 @@ bool fenceRegionCheck(const float* x, // otherwise, preprocessing is required for (int box_id = box_bgn; box_id < box_end; ++box_id) { int box_offset = box_id * 4; + float box_xl = flat_region_boxes[box_offset]; float box_xh = flat_region_boxes[box_offset + 1]; - float box_yl = flat_region_boxes[box_offset + 2]; - float box_yh = flat_region_boxes[box_offset + 3]; + + float box_yl = floatRound(flat_region_boxes[box_offset + 2], scale_factor); + float box_yh = floatRound(flat_region_boxes[box_offset + 3], scale_factor); float dx = std::max(std::min(node_xh, box_xh) - std::max(node_xl, box_xl), (float)0); float dy = std::max(std::min(node_yh, box_yh) - std::max(node_yl, box_yl), (float)0); @@ -333,8 +338,19 @@ bool legalityCheckKernelCPU(const float* x, std::cerr << "site alignment check error!" << std::endl; } - if (!overlapCheck( - x, y, node_size_x, node_size_y, site_width, row_height, scale_factor, xl, yl, xh, yh, num_nodes, num_movable_nodes)) { + if (!overlapCheck(x, + y, + node_size_x, + node_size_y, + site_width, + row_height, + scale_factor, + xl, + yl, + xh, + yh, + num_nodes, + num_movable_nodes)) { legal_flag = false; std::cerr << "overlap check error!" << std::endl; } @@ -348,7 +364,8 @@ bool legalityCheckKernelCPU(const float* x, flat_region_boxes_start, node2fence_region_map, num_movable_nodes, - num_regions)) { + num_regions, + scale_factor)) { legal_flag = false; std::cerr << "fence region check error!" 
<< std::endl; } diff --git a/cpp_to_py/gpudp/db/dp_torch.cpp b/cpp_to_py/gpudp/db/dp_torch.cpp index 87742a9..b6e2602 100644 --- a/cpp_to_py/gpudp/db/dp_torch.cpp +++ b/cpp_to_py/gpudp/db/dp_torch.cpp @@ -23,6 +23,7 @@ DPTorchRawDB::DPTorchRawDB(torch::Tensor node_lpos_init_, float xh_, float yl_, float yh_, + int num_conn_movable_nodes_, int num_movable_nodes_, int num_nodes_, float site_width_, @@ -45,6 +46,7 @@ DPTorchRawDB::DPTorchRawDB(torch::Tensor node_lpos_init_, num_nets = hyperedge_list_end_.size(0); num_regions = region_boxes_end_.size(0); num_movable_nodes = num_movable_nodes_; + num_conn_movable_nodes = num_conn_movable_nodes_; flat_node2pin_start_map = torch::cat({torch::zeros({1}, torch::dtype(torch::kInt32).device(torch::Device(node_size.device()))), @@ -92,12 +94,20 @@ bool DPTorchRawDB::check(float scale_factor) { } void DPTorchRawDB::scale(float scale_factor, bool use_round) { - pin_rel_lpos.mul_(scale_factor); if (use_round) { + pin_rel_lpos.mul_(scale_factor); node_size.mul_(scale_factor).round_(); node_lpos_init.mul_(scale_factor).round_(); + + node_size_x.mul_(scale_factor).round_(); + node_size_y.mul_(scale_factor).round_(); + init_x.mul_(scale_factor).round_(); + init_y.mul_(scale_factor).round_(); + pin_offset_x.mul_(scale_factor).round_(); + pin_offset_y.mul_(scale_factor).round_(); x.mul_(scale_factor).round_(); y.mul_(scale_factor).round_(); + flat_region_boxes.mul_(scale_factor).round_(); site_width = round(site_width * scale_factor); row_height = round(row_height * scale_factor); @@ -106,17 +116,27 @@ void DPTorchRawDB::scale(float scale_factor, bool use_round) { yl = round(yl * scale_factor); yh = round(yh * scale_factor); } else { - node_size.mul_(scale_factor); - node_lpos_init.mul_(scale_factor); - x.mul_(scale_factor); - y.mul_(scale_factor); - flat_region_boxes.mul_(scale_factor); - site_width = site_width * scale_factor; - row_height = row_height * scale_factor; - xl = xl * scale_factor; - xh = xh * scale_factor; - yl = yl * 
scale_factor; - yh = yh * scale_factor; + float inv_scale_factor = std::round(1.0 / scale_factor); + pin_rel_lpos.div_(inv_scale_factor); + node_size.div_(inv_scale_factor); + node_lpos_init.div_(inv_scale_factor); + + node_size_x.div_(inv_scale_factor); + node_size_y.div_(inv_scale_factor); + init_x.div_(inv_scale_factor); + init_y.div_(inv_scale_factor); + pin_offset_x.div_(inv_scale_factor); + pin_offset_y.div_(inv_scale_factor); + x.div_(inv_scale_factor); + y.div_(inv_scale_factor); + + flat_region_boxes.div_(inv_scale_factor); + site_width = site_width / inv_scale_factor; + row_height = row_height / inv_scale_factor; + xl = xl / inv_scale_factor; + xh = xh / inv_scale_factor; + yl = yl / inv_scale_factor; + yh = yh / inv_scale_factor; } } diff --git a/cpp_to_py/gpudp/db/dp_torch.h b/cpp_to_py/gpudp/db/dp_torch.h index e526b31..18551b4 100644 --- a/cpp_to_py/gpudp/db/dp_torch.h +++ b/cpp_to_py/gpudp/db/dp_torch.h @@ -25,6 +25,7 @@ class DPTorchRawDB { float xh_, float yl_, float yh_, + int num_conn_movable_nodes_, int num_movable_nodes_, int num_nodes_, float site_width_, @@ -88,6 +89,7 @@ class DPTorchRawDB { int num_nets; int num_nodes; int num_movable_nodes; + int num_conn_movable_nodes; int num_regions; float site_width; @@ -100,7 +102,8 @@ class DPTorchRawDB { // Legalization bool macroLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y); void abacusLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y); -void greedyLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y); +void greedyLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y, bool legalize_filler); +void fillerLegalization(DPTorchRawDB& at_db); // Detailed Placement void kReorder(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y, int K, int max_iters); diff --git a/cpp_to_py/gpudp/dp/independent_set_matching_cuda_kernel.cu b/cpp_to_py/gpudp/dp/independent_set_matching_cuda_kernel.cu index 2c8d65c..d41c5eb 100644 --- 
a/cpp_to_py/gpudp/dp/independent_set_matching_cuda_kernel.cu +++ b/cpp_to_py/gpudp/dp/independent_set_matching_cuda_kernel.cu @@ -162,6 +162,10 @@ void construct_spaces(DetailedPlaceData& db, int right_node_id = row2nodes[j + 1]; right_bound = min(right_bound, host_x[right_node_id]); } + // space.xh = right_bound; + // NOTE: some designs' fixed nodes are not placed on site (e.g. mgc_edit_dist_a), + // fix these cases by aligning them to site + // FIXME: need regression space.xh = std::floor(right_bound); space.xh = floorDiv(space.xh - db.xl, db.site_width) * db.site_width + db.xl; } diff --git a/cpp_to_py/gpudp/dp/ism/apply_solution.cuh b/cpp_to_py/gpudp/dp/ism/apply_solution.cuh index 245dc29..f96f91a 100644 --- a/cpp_to_py/gpudp/dp/ism/apply_solution.cuh +++ b/cpp_to_py/gpudp/dp/ism/apply_solution.cuh @@ -178,8 +178,30 @@ __global__ void move_nodes_kernel(DetailedPlaceDBType db, IndependentSetMatching if (j != sol_k) { atomicAdd(state.device_num_moved, 1); auto const& orig_space = orig_spaces[sol_k]; + // printf( + // "apply cost matrix %d, j %d, node_id %d, sol_k %d, pos_id %d, space (%g, %g), " + // "orig_space (%g, %g) xy (%g, %g) orig_xy (%g, %g)\n", + // i, + // j, + // node_id, + // sol_k, + // independent_set[sol_k], + // space.xl, + // space.xh, + // orig_space.xl, + // orig_space.xh, + // x, + // y, + // orig_x[sol_k], + // orig_y[sol_k]); x = orig_x[sol_k]; bool ret = adjust_pos(x, node_width, orig_space); + if (!ret) { + printf("ERROR: ism adjust_pos, node_width: %g, orig_space(%g, %g)\n", + node_width, + orig_space.xl, + orig_space.xh); + } assert(ret); y = orig_y[sol_k]; space = orig_space; diff --git a/cpp_to_py/gpudp/lg/filler_legalize.cpp b/cpp_to_py/gpudp/lg/filler_legalize.cpp new file mode 100644 index 0000000..d57f0e7 --- /dev/null +++ b/cpp_to_py/gpudp/lg/filler_legalize.cpp @@ -0,0 +1,264 @@ +#include "gpudp/lg/legalization_db.h" + +namespace dp { + +template +struct FillerBlank { + T xl; + T yl; + T xh; + T yh; + int bucket_list_level; + + 
FillerBlank() {} + FillerBlank(T xl_, T yl_, T xh_, T yh_) : xl(xl_), yl(yl_), xh(xh_), yh(yh_) {} + void intersect(const FillerBlank& rhs) { + xl = std::max(xl, rhs.xl); + xh = std::min(xh, rhs.xh); + yl = std::max(yl, rhs.yl); + yh = std::min(yh, rhs.yh); + } +}; + +void fixCells2Bins(const LegalizationData& db, + const float* x, + const float* y, + const float* node_size_x, + const float* node_size_y, + float bin_size_x, + float bin_size_y, + float xl, + float yl, + float xh, + float yh, + int num_bins_x, + int num_bins_y, + int num_nodes, + int num_movable_nodes, + int num_conn_movable_nodes, + std::vector>& bin_cells) { + // do not handle large macros + // one cell cannot be distributed to one bin + for (int i = 0; i < num_nodes; i += 1) { + if (i < num_conn_movable_nodes || i >= num_movable_nodes) { + int bin_id_x = (x[i] + node_size_x[i] / 2 - xl) / bin_size_x; + int bin_id_y = (y[i] + node_size_y[i] / 2 - yl) / bin_size_y; + + bin_id_x = std::min(std::max(bin_id_x, 0), num_bins_x - 1); + bin_id_y = std::min(std::max(bin_id_y, 0), num_bins_y - 1); + + int bin_id = bin_id_x * num_bins_y + bin_id_y; + + bin_cells[bin_id].push_back(i); + } + } + // sort bin cells + for (int i = 0; i < num_bins_x * num_bins_y; i += 1) { + std::vector& cells = bin_cells.at(i); + std::sort(cells.begin(), cells.end(), [&](int node_id1, int node_id2) { + float x1 = x[node_id1]; + float x2 = x[node_id2]; + return x1 < x2 || (x1 == x2 && node_id1 < node_id2); + }); + } +} + +void reduceBlanks(const float* x, + const float* y, + const float* node_size_x, + const float* node_size_y, + const std::vector>& bin_cells, + float bin_size_x, + float bin_size_y, + float xl, + float yl, + float xh, + float yh, + float site_width, + float row_height, + int num_bins_x, + int num_bins_y, + std::vector>>& bin_blanks) { + for (int i = 0; i < num_bins_x * num_bins_y; i += 1) { + int bin_id_x = i / num_bins_y; + int bin_id_y = i - bin_id_x * num_bins_y; + int bin_id = bin_id_x * num_bins_y + bin_id_y; 
+ + float bin_xl = xl + bin_id_x * bin_size_x; + float bin_xh = std::min(bin_xl + bin_size_x, xh); + float bin_yl = yl + bin_id_y * bin_size_y; + float bin_yh = std::min(bin_yl + bin_size_y, yh); + + FillerBlank blank; + blank.xl = floorDiv((bin_xl - xl), site_width) * site_width + xl; // align blanks to sites + blank.xh = floorDiv((bin_xh - xl), site_width) * site_width + xl; // align blanks to sites + blank.yl = bin_yl; + blank.yh = bin_yl + row_height; + + bin_blanks.at(bin_id).push_back(blank); + + const std::vector& cells = bin_cells.at(i); + std::vector>& blanks = bin_blanks.at(bin_id); + + for (unsigned int ci = 0; ci < cells.size(); ++ci) { + int node_id = cells.at(ci); + float node_xl = x[node_id]; + float node_yl = y[node_id]; + float node_xh = node_xl + node_size_x[node_id]; + float node_yh = node_yl + node_size_y[node_id]; + + if (blanks.empty()) { + break; + } + FillerBlank& blank = blanks.back(); + + if (node_xh > blank.xl && node_xl < blank.xh) // overlap + { + if (node_xl == blank.xl && node_xh == blank.xh) // remove + { + blanks.pop_back(); + } + if (node_xl == blank.xl && node_xh < blank.xh) // reduce + { + blank.xl = node_xh; + } + if (node_xl > blank.xl && node_xh == blank.xh) // reduce + { + blank.xh = node_xl; + } + if (node_xl > blank.xl && node_xh < blank.xh) // split + { + FillerBlank new_blank(node_xh, blank.yl, blank.xh, blank.yh); + blank.xh = node_xl; + blanks.push_back(new_blank); + } + } + } + } + + // // print blanks + // for (int i = 0; i < num_bins_x * num_bins_y; i += 1) { + // const std::vector>& blanks = bin_blanks.at(i); + // for (unsigned int j = 0; j < blanks.size(); ++j) { + // const FillerBlank& blank = blanks.at(j); + // logger.info( + // "bin %d: blank %d: xl = %g, xh = %g, yl = %g, yh = %g", i, j, blank.xl, blank.xh, blank.yl, blank.yh); + // } + // } +} + +void fillerLegalization(DPTorchRawDB& at_db) { + LegalizationData db(at_db); + + int num_blanks_x = 1; + int num_bins_x = 1; + + // bin dimension in y direction for 
blanks is different from that for cells + int num_blanks_y = floorDiv((db.yh - db.yl), db.row_height); + int num_bins_y = num_blanks_y; + logger.info("%s num_blanks_y = %d", "Standard cell legalization", num_blanks_y); + + // adjust bin sizes + float bin_size_x = (db.xh - db.xl) / static_cast(num_bins_x); + float bin_size_y = db.row_height; + + // allocate bin blanks + std::vector>> bin_blanks(num_blanks_x * num_blanks_y); + std::vector>> bin_blanks_copy(num_blanks_x * num_blanks_y); + std::vector> bin_cells(num_bins_x * num_bins_y); + + // distribute cells to bins + fixCells2Bins(db, + db.x, + db.y, + db.node_size_x, + db.node_size_y, + bin_size_x, + db.row_height, + db.xl, + db.yl, + db.xh, + db.yh, + num_bins_x, + num_bins_y, + db.num_nodes, + db.num_movable_nodes, + db.num_conn_movable_nodes, + bin_cells); + + // distribute blanks to bins + reduceBlanks(db.x, + db.y, + db.node_size_x, + db.node_size_y, + bin_cells, + bin_size_x, + db.row_height, + db.xl, + db.yl, + db.xh, + db.yh, + db.site_width, + db.row_height, + num_bins_x, + num_bins_y, + bin_blanks); + + // sort all blanks and create blank bucket list + int maxDegree = 0; + robin_hood::unordered_map>> blank_bucket_list; + for (int i = 0; i < num_blanks_x * num_blanks_y; i += 1) { + std::vector>& blanks = bin_blanks.at(i); + for (unsigned int j = 0; j < blanks.size(); ++j) { + FillerBlank& blank = blanks.at(j); + blank.bucket_list_level = roundDiv(blank.xh - blank.xl, db.site_width); + maxDegree = std::max(maxDegree, blank.bucket_list_level); + blank_bucket_list[blank.bucket_list_level].push_back(blank); + } + } + logger.info("%s maxDegree = %d", "Blanks", maxDegree); + + // sorted filler cell id + vector fillers_to_blank(db.num_movable_nodes - db.num_conn_movable_nodes); + for (int i = 0; i < db.num_movable_nodes - db.num_conn_movable_nodes; i += 1) { + fillers_to_blank[i] = db.num_conn_movable_nodes + i; + } + std::sort(fillers_to_blank.begin(), fillers_to_blank.end(), [&](int node_id1, int node_id2) { + 
float size_x1 = db.node_size_x[node_id1]; + float size_x2 = db.node_size_x[node_id2]; + return size_x1 < size_x2 || (size_x1 == size_x2 && node_id1 < node_id2); + }); + // for (int i = 0; i < db.num_movable_nodes - db.num_conn_movable_nodes; i += 1) { + // logger.info("filler %d: size_x = %g", filler_id[i], db.node_size_x[filler_id[i]]); + // } + + // put filler cells into blanks + while (!fillers_to_blank.empty()) { + int filler_id = fillers_to_blank.back(); + int filler_size_int = roundDiv(db.node_size_x[filler_id], db.site_width); + FillerBlank& max_blank = blank_bucket_list[maxDegree].back(); + assert(max_blank.bucket_list_level >= filler_size_int); + // update filler position + db.x[filler_id] = max_blank.xl; + db.y[filler_id] = max_blank.yl; + max_blank.xl += db.node_size_x[filler_id]; + max_blank.bucket_list_level -= filler_size_int; + blank_bucket_list[max_blank.bucket_list_level].push_back(max_blank); + blank_bucket_list[maxDegree].pop_back(); + while (blank_bucket_list[maxDegree].empty()) { + maxDegree -= 1; + } + + fillers_to_blank.pop_back(); + + // logger.info("put filler %d into blank %d: blank_xl = %g, blank_xh = %g", + // filler_id, + // max_blank.bucket_list_level, + // max_blank.xl, + // max_blank.xh); + + // logger.info("num_remaining_fillers = %d, max degree = %d", fillers_to_blank.size(), maxDegree); + } +} + +} // namespace dp \ No newline at end of file diff --git a/cpp_to_py/gpudp/lg/greedy_legalize.cpp b/cpp_to_py/gpudp/lg/greedy_legalize.cpp index 70025a1..8ed8e7d 100644 --- a/cpp_to_py/gpudp/lg/greedy_legalize.cpp +++ b/cpp_to_py/gpudp/lg/greedy_legalize.cpp @@ -45,10 +45,16 @@ void distributeCells2Bins(const LegalizationData& db, int num_bins_y, int num_nodes, int num_movable_nodes, - std::vector>& bin_cells) { + int num_conn_movable_nodes, + std::vector>& bin_cells, + bool legalize_filler) { // do not handle large macros // one cell cannot be distributed to one bin - for (int i = 0; i < num_movable_nodes; i += 1) { + int 
num_legalized_nodes = num_conn_movable_nodes; + if (legalize_filler) { + num_legalized_nodes = num_movable_nodes; + } + for (int i = 0; i < num_legalized_nodes; i += 1) { if (!db.is_dummy_fixed(i)) { int bin_id_x = (x[i] + node_size_x[i] / 2 - xl) / bin_size_x; int bin_id_y = (y[i] + node_size_y[i] / 2 - yl) / bin_size_y; @@ -527,7 +533,7 @@ void minNodeSize(const std::vector>& bin_cells, } } -void greedyLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y) { +void greedyLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y, bool legalize_filler) { LegalizationData db(at_db); db.set_num_bins(num_bins_x, num_bins_y); // first from right to left @@ -566,7 +572,9 @@ void greedyLegalization(DPTorchRawDB& at_db, int num_bins_x, int num_bins_y) { num_bins_y, db.num_nodes, db.num_movable_nodes, - bin_cells); + db.num_conn_movable_nodes, + bin_cells, + legalize_filler); // allocate bin fixed cells std::vector> bin_fixed_cells(num_bins_x * num_bins_y); diff --git a/cpp_to_py/gpudp/lg/legalization_db.h b/cpp_to_py/gpudp/lg/legalization_db.h index bf47796..6339f5e 100644 --- a/cpp_to_py/gpudp/lg/legalization_db.h +++ b/cpp_to_py/gpudp/lg/legalization_db.h @@ -82,6 +82,7 @@ class LegalizationData { num_threads(at_db.num_threads), num_nodes(at_db.num_nodes), num_movable_nodes(at_db.num_movable_nodes), + num_conn_movable_nodes(at_db.num_conn_movable_nodes), num_nets(at_db.num_nets), num_pins(at_db.num_pins), num_regions(at_db.num_regions) {} @@ -125,6 +126,7 @@ class LegalizationData { float site_width; int num_nets; + int num_conn_movable_nodes; int num_movable_nodes; int num_nodes; int num_pins; diff --git a/cpp_to_py/gpugr/db/GRDatabase.cpp b/cpp_to_py/gpugr/db/GRDatabase.cpp index 3efc552..79054e7 100644 --- a/cpp_to_py/gpugr/db/GRDatabase.cpp +++ b/cpp_to_py/gpugr/db/GRDatabase.cpp @@ -254,6 +254,7 @@ void GRDatabase::updateUsageLength() { } void GRDatabase::addFixObs() { + logger.info("Marking fixed cell obs..."); fixObs.clear(); // 1) add IOPins for 
(auto iopin : rawdb.iopins) { @@ -309,6 +310,7 @@ void GRDatabase::addFixObs() { } void GRDatabase::addMovObs() { + logger.info("Marking movable cell obs..."); movObs.clear(); for (auto cell : rawdb.cells) { if (!cell->fixed()) { @@ -320,16 +322,7 @@ void GRDatabase::addMovObs() { void GRDatabase::addCellObs(std::vector& allObs, db::Cell* cell) { db::CellType* ctype = cell->ctype(); - int cellOrient = 0; - if (!cell->flipX() && !cell->flipY()) { - cellOrient = 0; // N - } else if (cell->flipX() && cell->flipY()) { - cellOrient = 2; // S - } else if (cell->flipX() && !cell->flipY()) { - cellOrient = 4; // FN - } else if (!cell->flipX() && cell->flipY()) { - cellOrient = 6; // FS - } + int cellOrient = cell->orient(); int dx = ctype->originX() + cell->lx(); int dy = ctype->originY() + cell->ly(); // Macro Obs @@ -386,7 +379,7 @@ void GRDatabase::addCellObs(std::vector& allObs, db::Cell* cell) { tuple GRDatabase::getOrientOffset(int orient, int lx, int ly, int hx, int hy) { tuple offset; // lx, ly, hx, hy - // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE + // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE switch (orient) { case 0: // N offset = {lx, ly, hx, hy}; @@ -507,6 +500,19 @@ void GRDatabase::markObs(std::vector& allObs, utils::BoxT obsBox( curObs.lx - margin.x, curObs.ly - margin.y, curObs.hx + margin.x, curObs.hy + margin.y); + if (obsBox.IsValid()) { + if (obsBox.hx() <= gridlines[0][0] || obsBox.hy() <= gridlines[1][0] || + obsBox.lx() >= gridlines[0][gridlines[0].size() - 1] || + obsBox.ly() >= gridlines[1][gridlines[1].size() - 1]) { + logger.verbose("ignore obs that is outside gridgraph, obsBox: %d %d %d %d", + obsBox.lx(), + obsBox.hx(), + obsBox.ly(), + obsBox.hy()); + continue; + } + } + int xmin = std::upper_bound(gridlines[0].begin(), gridlines[0].end(), obsBox.lx()) - gridlines[0].begin() - 1; int xmax = @@ -520,7 +526,15 @@ void GRDatabase::markObs(std::vector& allObs, xmax = std::min(xmax, xSize - 1); ymax = std::min(ymax, ySize - 1); if (xmin 
> xmax || ymin > ymax) { - logger.error("continue obs %d %d %d %d", xmin, xmax, ymin, ymax); + logger.error("continue, obs: %d %d %d %d, obsBox: %d %d %d %d", + xmin, + xmax, + ymin, + ymax, + obsBox.lx(), + obsBox.hx(), + obsBox.ly(), + obsBox.hy()); continue; } utils::BoxT grBox(xmin, ymin, xmax, ymax); @@ -745,7 +759,8 @@ void GRDatabase::setupGrNets() { continue; } std::set> vis; - for (auto& e : pin_shapes) { + for (int shapeIdx = 0; shapeIdx < pin_shapes.size(); shapeIdx++) { + auto& e = pin_shapes[shapeIdx]; int xmin = std::upper_bound(gridlines[0].begin(), gridlines[0].end(), e.lx) - gridlines[0].begin() - 1; int xmax = @@ -761,7 +776,37 @@ void GRDatabase::setupGrNets() { xmax = std::min(std::max(xmax, 0), xSize - 1); ymax = std::min(std::max(ymax, 0), ySize - 1); if (xmin > xmax || ymin > ymax) { - logger.error("continue pin %d %d %d %d", xmin, xmax, ymin, ymax); + std::string instName = ""; + std::string instType = ""; + if (net_pin->iopin != nullptr) { + db::IOPin* iopin = net_pin->iopin; + instName = iopin->name; + instType = iopin->type->name(); + } else if (net_pin->cell != nullptr) { + db::Cell* cell = net_pin->cell; + instName = cell->name(); + instType = cell->ctype()->name; + } + // NOTE: some benchmarks have strange definition of pin shapes (lx ly hx hy). + // For example, in ispd18_test9, one of ADDFHX2 CI shapes is "RECT 2.59 0.40 3.67 0.36". 
+ logger.error( + "continue netId: %d netName: %s net_pinId: %d | instName: %s instType: %s pinName: %s " + "pinShapeId: %d | grid: %d %d %d %d | coord: %d %d %d %d", + netId, + rawdbNet->name.c_str(), + pinIdx, + instName.c_str(), + instType.c_str(), + net_pin->type->name().c_str(), + shapeIdx, + xmin, + xmax, + ymin, + ymax, + e.lx, + e.hx, + e.ly, + e.hy); continue; } for (int x = xmin; x <= xmax; x++) { diff --git a/cpp_to_py/gpugr/gr/GPURouter.cu b/cpp_to_py/gpugr/gr/GPURouter.cu index 66a69a3..5ba1c04 100644 --- a/cpp_to_py/gpugr/gr/GPURouter.cu +++ b/cpp_to_py/gpugr/gr/GPURouter.cu @@ -659,6 +659,9 @@ void GPURouter::route(vector &nets, int iter) { printf("%d\n", test.check_all(13, 128, 0)); exit(0);*/ // const int LEN = 10; + if (std::max(X, Y) >= 2000) { + logger.warning("Extremely large gridgraph may cause error during batch generation."); + } if (vis.size() == 0) { vis.resize(2000, std::vector(2000, 0)); visLL.resize(2000, std::vector(2000, 0)); diff --git a/cpp_to_py/io_parser/PyBindCppMain.cpp b/cpp_to_py/io_parser/PyBindCppMain.cpp index 468fe7a..08f1930 100644 --- a/cpp_to_py/io_parser/PyBindCppMain.cpp +++ b/cpp_to_py/io_parser/PyBindCppMain.cpp @@ -66,6 +66,41 @@ bool loadParams(const py::dict& kwargs) { utils::verbose_parser_log = false; } + // global logging level, default is LOG_INFO + // need to enable "verbose_parser_log" + if (kwargs.contains("global_log_level")) { + int log_level = kwargs["global_log_level"].cast(); + switch (log_level) { + case 0: + logger.set_global_log_level(utils::log_level::LOG_DEBUG); + break; + case 1: + logger.set_global_log_level(utils::log_level::LOG_VERBOSE); + break; + case 2: + logger.set_global_log_level(utils::log_level::LOG_INFO); + break; + case 3: + logger.set_global_log_level(utils::log_level::LOG_NOTICE); + break; + case 4: + logger.set_global_log_level(utils::log_level::LOG_WARN); + break; + case 5: + logger.set_global_log_level(utils::log_level::LOG_ERROR); + break; + case 6: + 
logger.set_global_log_level(utils::log_level::LOG_FATAL); + break; + case 7: + logger.set_global_log_level(utils::log_level::LOG_OK); + break; + default: + logger.set_global_log_level(utils::log_level::LOG_INFO); + break; + } + } + if (kwargs.contains("num_threads")) { db::setting.numThreads = kwargs["num_threads"].cast(); } diff --git a/cpp_to_py/io_parser/gp/GPDatabase.cpp b/cpp_to_py/io_parser/gp/GPDatabase.cpp index 4717d88..634d0b2 100644 --- a/cpp_to_py/io_parser/gp/GPDatabase.cpp +++ b/cpp_to_py/io_parser/gp/GPDatabase.cpp @@ -305,6 +305,209 @@ void GPDatabase::setupCheckVar() { // } } +std::string orient2name(orient_type orient) { + switch (orient) { + case 0: + return "N"; + case 1: + return "W"; + case 2: + return "S"; + case 3: + return "E"; + case 4: + return "FN"; + case 5: + return "FW"; + case 6: + return "FS"; + case 7: + return "FE"; + case -1: + return "NONE"; + default: + return "N"; + } +} + +orient_type hflip_orient(orient_type orient) { + // N => FN; W => FW; S => FS; E => FE + // FN => N; FW => W; FS => S; FE => E + if (orient == -1) { + return orient; + } else if (orient < 4) { + return orient + 4; + } else { + return orient - 4; + } +} + +orient_type rotate_180_orient(orient_type orient) { + // N => S; W => E; S => N; E => W + // FN => FS; FW => FE; FS => FN; FE => FW + if (orient == -1) { + return orient; + } else if (orient == 0 || orient == 1 || orient == 4 || orient == 5) { + return orient + 2; + } else { + return orient - 2; + } +} + +orient_type vflip_orient(orient_type orient) { return hflip_orient(rotate_180_orient(orient)); } + + +void GPDatabase::transferOrient() { + // Transfer all gpdb nodes to "N" direction for global placement + // NOTE: we don't change rawdb + logger.info("Updating gpdb node orient..."); + + // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE + auto getOrientDegreeFlip = [](orient_type orient) -> std::pair { + // return value: {degree, flip} + if (orient == 0) { // N + return {0, 0}; + } else if (orient == 
1) { // W + return {90, 0}; + } else if (orient == 2) { // S + return {180, 0}; + } else if (orient == 3) { // E + return {270, 0}; + } else if (orient == 4) { // FN + return {0, 1}; + } else if (orient == 5) { // FW + return {90, 1}; + } else if (orient == 6) { // FS + return {180, 1}; + } else if (orient == 7) { // FE + return {270, 1}; + } else { // assume NONE is N + return {0, 0}; + } + }; + + auto getRotatedSizes = [](int rotDegree, coord_type width, coord_type height) -> std::pair { + if (rotDegree == 0 || rotDegree == 180) { + return {width, height}; + } else if (rotDegree == 90 || rotDegree == 270) { + return {height, width}; + } else { + logger.warning("Unknown rotation degree %d, regarded as 0", rotDegree); + return {width, height}; + } + }; + + auto getRotatedPinInfo = [](int rotDegree, + coord_type srcNodeWidth, + coord_type srcNodeHeight, + coord_type srcPinWidth, + coord_type srcPinHeight, + coord_type srcPinRelLx, + coord_type srcPinRelLy) { + coord_type dstPinWidth = srcPinWidth; + coord_type dstPinHeight = srcPinHeight; + coord_type dstPinRelLx = std::numeric_limits::max(); + coord_type dstPinRelLy = std::numeric_limits::max(); + switch (rotDegree) { + default: + logger.warning("Unknown rotation degree %d, regarded as 0", rotDegree); + case 0: + dstPinRelLx = srcPinRelLx; + dstPinRelLy = srcPinRelLy; + break; + case 180: + dstPinRelLx = srcNodeWidth - srcPinRelLx - srcPinWidth; + dstPinRelLy = srcNodeHeight - srcPinRelLy - srcPinHeight; + break; + case 90: + dstPinRelLx = srcNodeHeight - srcPinRelLy - srcPinHeight; + dstPinRelLy = srcPinRelLx; + std::swap(dstPinWidth, dstPinHeight); + break; + case 270: + dstPinRelLx = srcPinRelLy; + dstPinRelLy = srcNodeWidth - srcPinRelLx - srcPinWidth; + std::swap(dstPinWidth, dstPinHeight); + break; + } + + return std::make_tuple(dstPinWidth, dstPinHeight, dstPinRelLx, dstPinRelLy); + }; + + auto getFlipYPinRelPos = + [](coord_type srcNodeWidth, coord_type srcNodeHeight, coord_type srcPinRelLx, coord_type 
srcPinRelLy) { + // assume the src here are after rotation + coord_type dstPinRelLx = srcNodeWidth - srcPinRelLx; + coord_type dstPinRelLy = srcPinRelLy; + + return std::make_pair(dstPinRelLx, dstPinRelLy); + }; + + // create a vector to restore statistics + int numOrientTypes = 9; // 0:N, 1:W, 2:S, 3:E, 4:FN, 5:FW, 6:FS, 7:FE, -1:NONE + std::vector orient2cnt(numOrientTypes, 0); + + for (auto& node : nodes) { + orient_type srcOrient = node.getOrient(); + if (srcOrient != 0 && srcOrient != -1) { + // srcOrient is not "N" or "NONE" + orient2cnt[srcOrient]++; + auto [srcDegree, srcFlip] = getOrientDegreeFlip(srcOrient); + auto [dstDegree, dstFlip] = getOrientDegreeFlip(0); // dst: N + + // compute rotation degree and flipping Y + int rotDegree = (dstDegree - srcDegree + 360) % 360; + bool flipY = (dstFlip != srcFlip); + + auto [dstNodeWidth, dstNodeHeight] = getRotatedSizes(rotDegree, node.getWidth(), node.getHeight()); + + for (auto& pin_id : node.pins()) { + auto& pin = pins[pin_id]; + auto [dstPinWidth, dstPinHeight, dstPinRelLx, dstPinRelLy] = getRotatedPinInfo(rotDegree, + node.getWidth(), + node.getHeight(), + pin.getWidth(), + pin.getHeight(), + pin.getRelLx(), + pin.getRelLy()); + pin.setWidth(dstPinWidth); + pin.setHeight(dstPinHeight); + pin.setRelLx(dstPinRelLx); + pin.setRelLy(dstPinRelLy); + } + + node.setWidth(dstNodeWidth); + node.setHeight(dstNodeHeight); + + if (flipY) { + for (auto& pin_id : node.pins()) { + auto& pin = pins[pin_id]; + auto [dstPinRelLx, dstPinRelLy] = + getFlipYPinRelPos(node.getWidth(), node.getHeight(), pin.getRelLx(), pin.getRelLy()); + pin.setRelLx(dstPinRelLx); + pin.setRelLy(dstPinRelLy); + } + } + } + } + + logger.info("=== Transfer Node Orient Statistics ==="); + for (int i = 0; i < orient2cnt.size(); i++) { + int count = orient2cnt[i]; + orient_type orient; + if (i == 8) { + orient = -1; // NONE + } else { + orient = i; + } + std::string orientName = orient2name(orient); + if (orient != 0 && orient != -1) { + logger.info(" 
%s => N: %d nodes in gpdb", orientName.c_str(), count); + } + } + logger.info("======================================="); +} + bool GPDatabase::setup() { if (db::setting.random_place) { setup_random_place(); @@ -316,6 +519,7 @@ bool GPDatabase::setup() { setupNets(); setupIndexMap(); setupCheckVar(); + transferOrient(); logger.info("Finish initializing global placement database"); return true; } @@ -600,6 +804,7 @@ std::vector GPDatabase::getSnetInfoTensor() { void GPDatabase::applyOneNodeOrient(int node_id) { auto& node = nodes[node_id]; + int rowId; int numRows = database.rows.size(); if (node.getLy() <= database.coreLY) { @@ -611,10 +816,14 @@ void GPDatabase::applyOneNodeOrient(int node_id) { rowId = std::max(std::min(rowId, numRows - 1), 0); } auto row = database.rows[rowId]; - if (row->flip()) { - node.setOrient(6); // FS + if (node.getOrient() == -1) { + node.setOrient(row->orient()); } else { - node.setOrient(0); // N + if (row->orient() == vflip_orient(node.getOrient())) { + node.setOrient(row->orient()); + } else if (row->orient() == hflip_orient(vflip_orient(node.getOrient()))) { + node.setOrient(vflip_orient(node.getOrient())); + } } } diff --git a/cpp_to_py/io_parser/gp/GPDatabase.h b/cpp_to_py/io_parser/gp/GPDatabase.h index 6b3cac5..e7ad338 100644 --- a/cpp_to_py/io_parser/gp/GPDatabase.h +++ b/cpp_to_py/io_parser/gp/GPDatabase.h @@ -186,6 +186,7 @@ class GPDatabase { void setupNets(); void setupIndexMap(); void setupCheckVar(); + void transferOrient(); bool setup(); bool reset(); diff --git a/data/README.md b/data/README.md index 3d60c92..5709dd2 100644 --- a/data/README.md +++ b/data/README.md @@ -1,4 +1,10 @@ The following script will automatically download `ispd2005`, `ispd2015`, and `iccad2019` benchmarks in `./data/raw`. It also preprocesses `ispd2015` benchmark to fix some errors when routing them by InnovusĀ®. 
```bash ./download_data.sh -``` \ No newline at end of file +``` + +# Note on Fixing ISPD 2015 +We provide a python script `fix_ispd2015_route.py` to fix some errors in `ispd2015` benchmark. Thus, Innovus can now route them in detail. + +## Limitations +**removeDefSNetVias**: Due to numerous DRVs caused by SNet Vias (spacing) after nanoroute routing, we have enabled `removeDefSNetVias` to remove these vias and address the above issue temporarily. It is likely that these vias are oversized, directly violating the spacing rule. While this adjustment has no significant impact on placement, it does result in open SNets. We sincerely encourage and appreciate contributions towards resolving this issue. Your contribution is highly valued and appreciated. \ No newline at end of file diff --git a/data/download_data.sh b/data/download_data.sh index ad57262..46ec9e7 100755 --- a/data/download_data.sh +++ b/data/download_data.sh @@ -21,7 +21,29 @@ tar xvzf iccad2019.tar.gz rm -rf iccad2019.tar.gz mv iccad2019/ raw/ +echo "=== Downloading ispd2018 ===" +mkdir raw/ispd2018 +for i in {1..10} +do + wget --no-check-certificate https://www.ispd.cc/contests/18/ispd18_test$i.tgz + tar xvzf ispd18_test$i.tgz + rm -rf ispd18_test$i.tgz + mv ispd18_test$i/ raw/ispd2018/ +done + +echo "=== Downloading ispd2019 ===" +mkdir raw/ispd2019 +for i in {1..10} +do + wget --no-check-certificate https://www.ispd.cc/contests/19/benchmarks/ispd19_test$i.tgz + tar xvzf ispd19_test$i.tgz + rm -rf ispd19_test$i.tgz + mv ispd19_test$i/ raw/ispd2019/ +done +python remove_fence_in_ispd19_test5.py + # echo "=== (Optional) Converting raw design to torch data ===" # python convert_design_to_torch_data.py --dataset ispd2005 # python convert_design_to_torch_data.py --dataset ispd2015_fix -# python convert_design_to_torch_data.py --dataset iccad2019 \ No newline at end of file +# python convert_design_to_torch_data.py --dataset iccad2019 +# python convert_design_to_torch_data.py --dataset ispd2019 \ No newline 
at end of file diff --git a/data/fix_ispd2015_route.py b/data/fix_ispd2015_route.py index c06ed2f..d09bedf 100644 --- a/data/fix_ispd2015_route.py +++ b/data/fix_ispd2015_route.py @@ -9,8 +9,12 @@ dataset_root = "./raw" # remove fence region related information, including GROUP, REGION removeDefFence = True -# Since there are so many DRCs caused by SNet Vias (spacing) when routing by nanoroute, -# we remove these SNet Vias to avoid them. Note that such modification won't affect placement. +# Due to numerous DRVs caused by SNet Vias (spacing) after nanoroute routing, we have +# removed these vias to address the issue. It is likely that these vias are oversized, +# directly violating the spacing rule. While this adjustment has no significant impact +# on placement, it does result in an open SNet. We sincerely encourage and appreciate +# contributions towards resolving this issue. Your contribution is highly valued and +# appreciated. removeDefSNetVias = True # Some fixed cell is not placed on manufacture grid, we move them to the nearest grid fixDefPlaceOnManGrid = True # only for mgc_fft_b, mgc_matrix_mult_b @@ -27,8 +31,9 @@ # Add default to fix nanoroute via error. # FIXME: I dont' know which vias in superblue should be set with default... 
addDefaultForLefVia = True # only for mgc_superblue_* -# PG Nets will cause a lot of violations with macro OBS, use EXCEPTPGNET to avoid that +# PG Nets will cause a lot of violations with macro OBS/Route Blkg, use EXCEPTPGNET to avoid that eceptPGNetsForObs = True +eceptPGNetsForBlkg = True def generate_one_raw_design(input_root, output_root, design_name): @@ -207,6 +212,7 @@ def generateDefContent(defLines, design_name): isSNet, isVia, isNdr = False, False, False isRegion, isGroup = False, False + isBlockages = False jumpToEndComponents = False for lid, line in enumerate(defLines): if fixDefTracksLayers: @@ -266,6 +272,15 @@ def generateDefContent(defLines, design_name): if isGroup or isRegion: continue + if eceptPGNetsForBlkg: + if "BLOCKAGES" in line and "END" not in line: + isBlockages = True + if "END BLOCKAGES" in line: + isBlockages = False + if isBlockages: + if "LAYER" in line: + line = line.replace("\n", " + EXCEPTPGNET\n") + if fixDefPlaceOnManGrid and "fft_b" in design_name: if "+ PLACED ( 661359 799490 ) N" in line: line = line.replace("661359", "661360") diff --git a/data/remove_fence_in_ispd19_test5.py b/data/remove_fence_in_ispd19_test5.py new file mode 100644 index 0000000..4c46c5b --- /dev/null +++ b/data/remove_fence_in_ispd19_test5.py @@ -0,0 +1,75 @@ +# This script removes the fence region in ispd2019_test5 + +import os +import sys +sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")) + +# set dataset root +dataset_root = "./raw" +# remove fence region related information, including GROUP, REGION +removeDefFence = True + + +def generate_one_raw_design(input_root, output_root, design_name): + print("==============") + design_root = os.path.join(input_root, design_name) + #################### lef #################### + defFile = os.path.join(design_root, "%s.input.def" % design_name) + lefFile = os.path.join(design_root, "%s.input.lef" % design_name) + outputLef = "%s/%s/%s.input.lef" % (output_root, design_name, 
design_name) + if not os.path.exists(os.path.dirname(outputLef)): + os.makedirs(os.path.dirname(outputLef)) + + if design_name == "ispd19_test5": + with open(defFile) as t: + defLines = t.readlines() + + #################### def #################### + defoutputFile = "%s/%s/%s.input.def" % (output_root, design_name, design_name) + + defcontent = generateDefContent(defLines, design_name) + + with open(defoutputFile, 'w') as f: + f.write(defcontent) + + print(defoutputFile) + else: + os.system("cp %s %s/%s/%s.input.def" % (defFile, output_root, design_name, design_name)) + + os.system("cp %s %s/%s/%s.input.lef" % (lefFile, output_root, design_name, design_name)) + +def generateDefContent(defLines, design_name): + defcontent = "" + + isRegion, isGroup = False, False + for lid, line in enumerate(defLines): + + if removeDefFence: + if "REGIONS" in line and "END" not in line: + isRegion = True + continue + if "END REGIONS" in line: + isRegion = False + continue + if "GROUPS" in line and "END" not in line: + isGroup = True + continue + if "END GROUPS" in line: + isGroup = False + continue + if isGroup or isRegion: + continue + + defcontent += line + return defcontent + +if __name__ == "__main__": + raw_root = os.path.join(dataset_root, "ispd2019") + all_designs = os.listdir(raw_root) + sorted(all_designs) + + # Raw design clean + print("-------------------------------------------------") + output_root = os.path.join(dataset_root, "ispd2019_no_fence") + for design_name in all_designs: + generate_one_raw_design(raw_root, output_root, design_name) \ No newline at end of file diff --git a/main.py b/main.py index fee5503..91a29ab 100644 --- a/main.py +++ b/main.py @@ -39,6 +39,7 @@ def get_option(): # global routing params parser.add_argument('--use_cell_inflate', type=str2bool, default=False, help='use cell inflation') + parser.add_argument('--min_area_inc', type=float, default=0.01, help='threshold of terminating inflation') parser.add_argument('--use_route_force', 
type=str2bool, default=False, help='use routing force') parser.add_argument('--route_freq', type=int, default=1000, help='routing freq') parser.add_argument('--num_route_iter', type=int, default=400, help='number of routing iters') @@ -54,7 +55,9 @@ def get_option(): parser.add_argument('--final_route_eval', type=str2bool, default=False, help='eval placement solution by GR') # logging and saver - parser.add_argument('--log_freq', type=int, default=100) + parser.add_argument('--log_freq', type=int, default=100) + parser.add_argument('--verbose_cpp_log', type=str2bool, default=False, help='verbose cpp log for debugging') + parser.add_argument('--cpp_log_level', type=int, default=2, help='0: DEBUG, 1: VERBOSE, 2:INFO') parser.add_argument('--result_dir', type=str, default='result', help='log/model root directory') parser.add_argument('--exp_id', type=str, default='', help='experiment id') parser.add_argument('--log_dir', type=str, default='log', help='log directory') @@ -76,6 +79,10 @@ def get_option(): print("We haven't yet support fence region in ispd2015, use ispd2015_fix instead") args.dataset = "ispd2015_fix" + if args.dataset == "ispd2019": + print("We haven't yet support fence region in ispd2019, use ispd2019_no_fence instead") + args.dataset = "ispd2019_no_fence" + if args.custom_path != "": get_custom_design_params(args) @@ -92,4 +99,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/src/calculator.py b/src/calculator.py index 514ffbf..2c1659c 100644 --- a/src/calculator.py +++ b/src/calculator.py @@ -58,7 +58,7 @@ def calc_obj_and_grad( data.hyperedge_list, data.hyperedge_list_end, data.net_mask, data.hpwl_scale, ps.wa_coeff, args.deterministic ) - mov_node_pos.grad[mov_lhs:mov_rhs] = conn_node_grad_by_wl[mov_lhs:mov_rhs] + mov_node_pos.grad[mov_lhs:mov_rhs] += conn_node_grad_by_wl[mov_lhs:mov_rhs] if ps.enable_sample_force: if ps.iter > 3 and ps.iter % 20 == 0: # ps.iter > 3 for warmup diff --git 
a/src/core/route_force.py b/src/core/route_force.py index f5cd221..0cf0791 100644 --- a/src/core/route_force.py +++ b/src/core/route_force.py @@ -26,9 +26,11 @@ def __init__(self) -> None: self.original_filler_area_total = None self.original_pin_rel_cpos: torch.Tensor = None self.original_target_density = None + self.original_total_mov_area_without_filler = None self.original_num_fillers = None self.original_mov_node_size: torch.Tensor = None self.original_mov_node_size_real: torch.Tensor = None + self.original_init_density_map: torch.Tensor = None def reset(self): self.first_run = True @@ -44,9 +46,11 @@ def reset(self): self.original_filler_area_total = None self.original_pin_rel_cpos = None self.original_target_density = None + self.original_total_mov_area_without_filler = None self.original_num_fillers = None self.original_mov_node_size = None self.original_mov_node_size_real = None + self.original_init_density_map = None route_cache = RouteCache() @@ -471,13 +475,21 @@ def filler_pseudo_wire_force(data, ps, mov_node_pos, mov_node_size, routeforce, def route_inflation( args, logger, data, rawdb, gpdb, ps, mov_node_pos, mov_node_size, expand_ratio, constraint_fn=None, skip_m1_route=True, use_weighted_inflation=True, hv_same_ratio=True, - min_area_inc=0.01, decrease_target_density=False, **kwargs + dynamic_target_density=True, **kwargs ): mov_lhs, mov_rhs = data.movable_index fix_lhs, fix_rhs = data.fixed_connected_index _, filler_lhs = data.movable_connected_index filler_rhs = mov_node_pos.shape[0] num_fillers = filler_rhs - filler_lhs + decrease_target_density = False + + # FIXME: How can we dynamically set args.min_area_inc according to the congestion map? + # I believe there should be elegant ways to do that like computing local statistics... + if args.design_name == "ispd18_test10": + # This design is extremely congested in its center. + # Temporarily hard coded. This value may not be the best... 
+ args.min_area_inc = 0.001 if route_cache.first_run: # TODO: check args.target_density should be changed or not? @@ -500,6 +512,8 @@ def route_inflation( route_cache.original_filler_area_total = torch.sum(torch.prod(filler_size, 1)).item() route_cache.original_pin_rel_cpos = data.pin_rel_cpos.clone() route_cache.original_target_density = copy.deepcopy(args.target_density) + route_cache.original_total_mov_area_without_filler = copy.deepcopy(data.__total_mov_area_without_filler__) + route_cache.original_init_density_map = data.init_density_map.clone() ori_mov_node_size = route_cache.original_mov_node_size ori_mov_node_size_real = route_cache.original_mov_node_size_real @@ -508,6 +522,7 @@ def route_inflation( # 1) check remain space last_mov_area = torch.prod(mov_node_size_real[mov_lhs:filler_lhs], 1) last_mov_area_total = last_mov_area.sum().item() + last_filler_area_total = torch.sum(torch.prod(mov_node_size_real[filler_lhs:filler_rhs], 1)).item() max_inc_area_total = min(0.1 * route_cache.whitespace_area, route_cache.placeable_area - last_mov_area_total) # TODO: tune if max_inc_area_total <= 0: logger.warning("No space to inflate. 
Terminate inflation.") @@ -521,6 +536,7 @@ def route_inflation( **kwargs ) grdb, routeforce, input_mat, cg_mapHV, _, _, route_gradmat, gr_metrics = output + numOvflNets, gr_wirelength, gr_numVias, gr_numShorts, rc_hor_mean, rc_ver_mean = gr_metrics num_bin_x, num_bin_y = input_mat.shape[0], input_mat.shape[1] unit_len_x, unit_len_y = routeforce.gcell_steps() @@ -534,6 +550,13 @@ def route_inflation( inflate_mat: torch.Tensor = torch.stack((input_mat + 1, input_mat + 1)).contiguous().pow_(2) else: inflate_mat: torch.Tensor = (cg_mapHV + 1).permute(0, 2, 1).contiguous() + if dynamic_target_density and last_filler_area_total / last_mov_area_total > 1.1 and numOvflNets / data.num_nets > 0.04: + # filler area too large => low utilization, can adjust size more aggressively to reduce GR overflow + max_inc_area_total = route_cache.placeable_area - last_mov_area_total + logger.info("Low utilization detect, globally inflate...") + global_ratio = (rc_hor_mean + rc_ver_mean) / 2 + inflate_mat *= global_ratio + decrease_target_density = True inflate_mat.clamp_(min=1.0, max=2.0) # NOTE: 1) If use_weighted_inflation == False, use max congestion as inflation ratio. @@ -556,6 +579,10 @@ def route_inflation( inc_mov_area = expect_new_mov_area - last_mov_area inc_mov_area_total = inc_mov_area.sum().item() inc_area_scale = max_inc_area_total / inc_mov_area_total + if inc_mov_area_total <= 0: + logger.warning("Negative area increment %.4f. Early terminate cell inflation." % (inc_mov_area_total)) + ps.use_cell_inflate = False # not inflation anymore + return gr_metrics, None, None if inc_area_scale < 1: logger.warning("Not enough space to inflate. 
Scale down.") inc_mov_area *= inc_area_scale @@ -569,10 +596,10 @@ def route_inflation( new_mov_node_size_real: torch.Tensor = mov_node_size_real.clone() new_mov_node_size_real[mov_lhs:filler_lhs].mul_(this_mov_conn_inflate_ratio) - if inc_mov_area_total / last_mov_area_total < min_area_inc: + if inc_mov_area_total / last_mov_area_total < args.min_area_inc: logger.warning( "Too small relative area increment (%.4f < %.4f). Early terminate cell inflation." % ( - inc_mov_area_total / last_mov_area_total, min_area_inc + inc_mov_area_total / last_mov_area_total, args.min_area_inc )) ps.use_cell_inflate = False # not inflation anymore return gr_metrics, None, None @@ -580,13 +607,13 @@ def route_inflation( # 5) update total filler inflation ratio new_mov_area_total = last_mov_area_total + inc_mov_area_total new_filler_area_total = 0.0 - last_filler_area_total = torch.sum(torch.prod(mov_node_size_real[filler_lhs:filler_rhs], 1)).item() filler_scale = 0.0 if new_mov_area_total + last_filler_area_total > route_cache.target_area: new_filler_area_total = max(route_cache.target_area - new_mov_area_total, 0) - if decrease_target_density and new_filler_area_total / route_cache.placeable_area > 0.2: - # remove some pre-inserted fillers / FloatMov nodes to decrease the target density - new_target_density = max(0.8, 0.85 * 1.0) + if decrease_target_density: + logger.info("Removing some pre-inserted fillers / FloatMov nodes to decrease the target density...") + # standard cell density: 1 / (1 + 1.1) = 0.4762 + new_target_density = max(0.4762, 0.85 * args.target_density) new_target_area = new_target_density * route_cache.placeable_area new_filler_area_total = max(new_target_area - new_mov_area_total, 0) filler_scale = new_filler_area_total / route_cache.original_filler_area_total @@ -594,11 +621,9 @@ def route_inflation( num_remain_cells = filler_lhs + math.ceil(filler_scale * original_num_fillers) route_cache.target_area = new_target_area - new_mov_node_size_real = 
new_mov_node_size_real[:num_remain_cells] - mov_node_size_real = mov_node_size_real[:num_remain_cells] - logger.warning("Remove nodes to reduce target density from %.4f to %.4f. \ - This step may remove some FloatMov. #Nodes from %d to %d" % - (old_target_density, args.target_density, filler_rhs, num_remain_cells)) + # set filler size as 0 to remove + new_mov_node_size_real[num_remain_cells:] = 0 + mov_node_size_real[num_remain_cells:] = 0 elif route_cache.original_filler_area_total > 0: filler_scale = math.sqrt(new_filler_area_total / route_cache.original_filler_area_total) new_mov_node_size_real[filler_lhs:filler_rhs] = filler_scale * ori_mov_node_size_real[filler_lhs:filler_rhs] @@ -620,7 +645,19 @@ def route_inflation( args.target_density = (new_mov_area_total + new_filler_area_total) / route_cache.placeable_area logger.info("Update target density from %.4f to %.4f" % (old_target_density, args.target_density)) - logger.info("Relative area | increment: %.4f, mov: %.4f, filler: %.4f, all_cells: %.4f" % ( + if decrease_target_density: + # since we decrease target density, update the init density map correspondingly + logger.warning("Remove nodes to reduce target density from %.4f to %.4f. This step may remove some FloatMov. 
#Nodes from %d to %d" % + (old_target_density, args.target_density, filler_rhs, num_remain_cells)) + data.init_density_map.clamp_(min=0.0, max=1.0).div_(old_target_density).mul_(args.target_density) + + logger.info("Absolute area (last) | mov: %.4E, filler: %.4E, all_cells: %.4E" % ( + last_mov_area_total, last_filler_area_total, last_mov_area_total + last_filler_area_total + )) + logger.info("Absolute area (this) | mov: %.4E, filler: %.4E, all_cells: %.4E" % ( + new_mov_area_total, new_filler_area_total, new_mov_area_total + new_filler_area_total + )) + logger.info("Relative area change | increment: %.4f, mov: %.4f, filler: %.4f, all_cells: %.4f" % ( inc_mov_area_total / last_mov_area_total, new_mov_area_total / last_mov_area_total, new_filler_area_total / last_filler_area_total if last_filler_area_total > 1e-5 else 0, @@ -656,6 +693,11 @@ def route_inflation( expand_ratio = mov_node_area / clamp_mov_node_area mov_node_size = clamp_mov_node_size + if decrease_target_density: + # FIXME: should we update total_mov_area_without_filler when decrease_target_density == False? 
+ mov_cell_area = torch.prod(new_mov_node_size_real[mov_lhs:mov_rhs, ...], 1) + data.__total_mov_area_without_filler__ = torch.sum(mov_cell_area).item() + ps.use_cell_inflate = True return gr_metrics, mov_node_size, expand_ratio @@ -665,5 +707,8 @@ def route_inflation_roll_back(args, logger, data, mov_node_size): mov_lhs, mov_rhs = data.movable_index mov_node_size[mov_lhs:mov_rhs].copy_(route_cache.original_mov_node_size[mov_lhs:mov_rhs]) data.pin_rel_cpos.copy_(route_cache.original_pin_rel_cpos) - args.target_density = route_cache.original_target_density + data.__total_mov_area_without_filler__ = route_cache.original_total_mov_area_without_filler + if args.target_density != route_cache.original_target_density: + args.target_density = route_cache.original_target_density + data.init_density_map.copy_(route_cache.original_init_density_map) route_cache.reset() diff --git a/src/database.py b/src/database.py index 7a2587d..b1182b8 100644 --- a/src/database.py +++ b/src/database.py @@ -17,14 +17,16 @@ def load_dataset(args, logger, placement=None): if args.load_from_raw: logger.info("loading from original benchmark...") rawdb, gpdb = parser.read( - params, verbose_log=False, lite_mode=True, random_place=False, num_threads=args.num_threads + params, verbose_log=args.verbose_cpp_log, log_level=args.cpp_log_level, + lite_mode=True, random_place=False, num_threads=args.num_threads ) design_info = parser.preprocess_design_info(gpdb) else: logger.info("loading from pt benchmark...") design_pt_path = "./data/cad/%s/%s.pt" % (args.dataset, args.design_name) parser.load_params( - params, verbose_log=False, lite_mode=True, random_place=False, num_threads=args.num_threads + params, verbose_log=args.verbose_cpp_log, log_level=args.cpp_log_level, + lite_mode=True, random_place=False, num_threads=args.num_threads ) design_info = torch.load(design_pt_path) gpdb = None @@ -140,10 +142,11 @@ def __init__( self.__site_height__ = site_info[1] self.__row_height__ = site_info[1] # the same 
as site height - self.__ori_die_lx__ = die_info[0].item() - self.__ori_die_hx__ = die_info[1].item() - self.__ori_die_ly__ = die_info[2].item() - self.__ori_die_hy__ = die_info[3].item() + lx, hx, ly, hy = die_info.cpu().numpy() + self.__ori_die_lx__ = lx + self.__ori_die_hx__ = hx + self.__ori_die_ly__ = ly + self.__ori_die_hy__ = hy self.__num_nodes__ = node_pos.shape[0] self.__num_pins__ = pin_id2node_id.shape[0] @@ -489,7 +492,7 @@ def backup_ori_var(self): def preshift(self): # shift die info to (0.0, hx, 0.0, hy) - die_lx, _, die_ly, _ = self.die_info.tolist() + die_lx, _, die_ly, _ = self.die_info.cpu().numpy() die_shift = torch.tensor( [die_lx, die_ly], dtype=self.die_info.dtype, device=self.die_info.device, ) @@ -519,7 +522,7 @@ def prescale_by_site_width(self): def prescale(self): # scale die info to (0.0, 1.0, 0.0, 1.0) - die_lx, die_hx, die_ly, die_hy = self.die_info.tolist() + die_lx, die_hx, die_ly, die_hy = self.die_info.cpu().numpy() die_scale = torch.tensor( [die_hx - die_lx, die_hy - die_ly], dtype=self.die_info.dtype, @@ -543,10 +546,11 @@ def prescale(self): def pre_compute_var(self): args = self.__args__ device = self.node_size.get_device() + dtype = self.node_size.dtype # die related - lx, hx, ly, hy = self.die_info.tolist() + lx, hx, ly, hy = self.die_info.cpu().numpy() self.unit_len = torch.tensor( - [(hx - lx) / self.num_bin_x, (hy - ly) / self.num_bin_y], device=device + [(hx - lx) / self.num_bin_x, (hy - ly) / self.num_bin_y], device=device, dtype=dtype ) self.die_ur = self.die_info.reshape(2, 2).t()[1].clone() self.die_ll = self.die_info.reshape(2, 2).t()[0].clone() @@ -742,7 +746,7 @@ def logging_statistics(self): num_fltiopin, ) ) - content += "Core Info " + str(self.die_info.tolist()) + "\n" + content += "Core Info " + str([i for i in self.die_info.cpu().numpy()]) + "\n" content += "Site Width = %d, Row Height = %d\n" % ( self.site_width, self.site_height, diff --git a/src/detail_placement.py b/src/detail_placement.py index 
05084d6..c033949 100644 --- a/src/detail_placement.py +++ b/src/detail_placement.py @@ -145,14 +145,21 @@ def rearrange_dpdb_node_info(node_pos: torch.Tensor, data: PlaceData): def setup_detailed_rawdb( - node_pos: torch.Tensor, use_cpu_db_: bool, data: PlaceData, args, logger + node_pos: torch.Tensor, use_cpu_db_: bool, data: PlaceData, args, logger, after_lg=True ): curr_site_width = 1.0 # prescale_by_site_width node_lpos, node_size, node_weight, pin_id2node_id, node2pin_list, node2pin_list_end = rearrange_dpdb_node_info( node_pos, data ) + if after_lg: + # NOTE: we assume all legalized cells are on integer system + # this step can avoid some potential floating-point precision errors + _, floatmov_rhs, _ = data.node_type_indices[1] + inv_scalar = round(1.0 / get_ori_scale_factor(data)) + node_lpos[:floatmov_rhs].mul_(inv_scalar).round_().div_(inv_scalar) mov_lhs, mov_rhs = data.movable_index + conn_mov_lhs, conn_mov_rhs = data.movable_connected_index if args.scale_design: # scale back die_scale = data.die_scale / data.site_width # assume site width == 1 in dp @@ -179,12 +186,9 @@ def setup_detailed_rawdb( num_iopin = iopin_rhs - fix_rhs num_floatiopin = floatiopin_rhs - blkg_rhs - die_info = die_info.cpu() - xl = die_info[0].item() - xh = die_info[1].item() - yl = die_info[2].item() - yh = die_info[3].item() + xl, xh, yl, yh = die_info.cpu().numpy() num_movable_nodes = mov_rhs - mov_lhs + num_conn_movable_nodes = conn_mov_rhs - conn_mov_lhs num_nodes = node_lpos.shape[0] - num_iopin - num_floatiopin site_width = curr_site_width row_height = data.row_height / data.site_width @@ -213,6 +217,7 @@ def setup_detailed_rawdb( xh, yl, yh, + num_conn_movable_nodes, num_movable_nodes, num_nodes, site_width, @@ -238,6 +243,7 @@ def setup_detailed_rawdb( xh, yl, yh, + num_conn_movable_nodes, num_movable_nodes, num_nodes, site_width, @@ -296,7 +302,7 @@ def commit_to_node_pos(node_pos: torch.Tensor, data:PlaceData, dp_rawdb): def run_lg(node_pos: torch.Tensor, data: 
PlaceData, args, logger): # CPU legalization - lg_rawdb = setup_detailed_rawdb(node_pos, True, data, args, logger) + lg_rawdb = setup_detailed_rawdb(node_pos, True, data, args, logger, after_lg=False) # run LG logger.info("Start running Macro Legalization...") @@ -306,13 +312,38 @@ def run_lg(node_pos: torch.Tensor, data: PlaceData, args, logger): lg_rawdb.commit() logger.info("Finish Macro Legalization. Time: %.4f" % (time.time() - ml_time)) + total_cell_area = torch.sum(torch.prod(data.node_size, 1)).item() + die_area = torch.prod(data.die_ur - data.die_ll).item() + is_high_util = (total_cell_area / die_area) > 0.999 + logger.info("Utilization: %.2f" % (total_cell_area / die_area)) + logger.info("Start running Greedy Legalization...") gl_time = time.time() num_bins_x, num_bins_y = 1, 64 - gpudp.greedyLegalization(lg_rawdb, num_bins_x, num_bins_y) - if not lg_rawdb.check(get_ori_scale_factor(data)): - logger.error("Check failed in Greedy Legalization") - logger.info("Finish Greedy Legalization. Time: %.4f" % (time.time() - gl_time)) + if not is_high_util: + gpudp.greedyLegalization(lg_rawdb, num_bins_x, num_bins_y, True) + if is_high_util or not lg_rawdb.check(get_ori_scale_factor(data)): + logger.warning("Check failed in Greedy Legalization. Re-try by Greedy + Filler Legalization.") + + # NOTE: this greedy legalization only legalizes movable connected cells + logger.info("Start running Greedy Legalization...") + gpudp.greedyLegalization(lg_rawdb, num_bins_x, num_bins_y, False) + # NOTE: filler legalization only legalizes movable unconnected cells (fillers) + logger.info("Start Filler Legalization...") + gpudp.fillerLegalization(lg_rawdb) + if not lg_rawdb.check(get_ori_scale_factor(data)): + logger.error("Check failed in Greedy + Filler Legalization.") + logger.info("Finish Greedy + Filler Legalization. Time: %.4f" % (time.time() - gl_time)) + else: + logger.info("Finish Greedy Legalization. 
Time: %.4f" % (time.time() - gl_time)) + + # # Commit result + # commit_to_node_pos(node_pos, data, lg_rawdb) + # torch.cuda.synchronize(node_pos.device) + # if args.scale_design: + # node_pos /= data.die_scale + # info = (-1, 0, data.design_name) + # draw_fig_with_cairo_cpp(node_pos, data.node_size, data, info, args, base_size=4096) logger.info("Start running Abacus Legalization...") al_time = time.time() @@ -380,14 +411,16 @@ def run_dp(node_pos: torch.Tensor, data: PlaceData, args, logger): ism_iter = 50 # use integer coordinate systems in DP for better quality - scalar = compute_scalar(get_ori_scale_factor(data)) + # scalar = compute_scalar(get_ori_scale_factor(data)) + scalar = 1.0 def dp_handler(dp_func, func_name, *func_args): logger.info("Start running %s..." % func_name) start_time = time.time() if scalar != 1.0: - logger.info("scale dp_rawdb by %g" % (1.0 / scalar)) - dp_rawdb.scale(1.0 / scalar, True) + # NOTE: we assume site_width is integer, so 1 / scalar should be an integer + logger.info("scale dp_rawdb by %g" % round(1.0 / scalar)) + dp_rawdb.scale(round(1.0 / scalar), True) dp_func(dp_rawdb, *func_args) if scalar != 1.0: logger.info("scale dp_rawdb back by %g" % scalar) @@ -431,6 +464,11 @@ def run_dp_route_opt(node_pos: torch.Tensor, gpdb, rawdb, ps, data: PlaceData, a node_pos[:mov_rhs].detach() - data.node_size[:mov_rhs] / 2, data.node_lpos[mov_rhs:], ), dim=0).cpu() + # this step can avoid some potential floating-point precision errors + _, floatmov_rhs, _ = data.node_type_indices[1] + inv_scalar = round(1.0 / get_ori_scale_factor(data)) + node_lpos[:floatmov_rhs].mul_(inv_scalar).round_().div_(inv_scalar) + node_size = data.node_size.cpu() if args.scale_design: # scale back @@ -441,10 +479,7 @@ def run_dp_route_opt(node_pos: torch.Tensor, gpdb, rawdb, ps, data: PlaceData, a site_width = 1.0 row_height = data.row_height / data.site_width die_info = data.die_info.cpu() - dieLX = die_info[0].item() - dieHX = die_info[1].item() - dieLY = 
die_info[2].item() - dieHY = die_info[3].item() + dieLX, dieHX, dieLY, dieHY = die_info.numpy() K = 5 new_node_lpos = routedp.dp_route_opt( node_lpos, node_size, dieLX, dieHX, dieLY, dieHY, diff --git a/src/initializer.py b/src/initializer.py index 6f4e35c..52b5d1f 100644 --- a/src/initializer.py +++ b/src/initializer.py @@ -54,6 +54,24 @@ def get_init_density_map(rawdb, gpdb, data: PlaceData, args, logger): args.deterministic ) init_density_map += snet_density_map.contiguous() + + # compute utilization to determine whether there are sufficient space to enlarge snet density + mov_lhs, mov_rhs = data.movable_index + mov_node_size = data.node_size[mov_lhs:mov_rhs, ...] + total_mov_cell_area = torch.sum(torch.prod(mov_node_size, 1)).item() + die_area = torch.prod(data.die_ur - data.die_ll) + fixed_node_area = init_density_map.clamp_(min=0.0, max=1.0).sum() * data.bin_area + placeable_area = die_area - fixed_node_area + total_filler_area = max(args.target_density * placeable_area - total_mov_cell_area, 0.0) + if total_filler_area / total_mov_cell_area > 2: + # low utilization, can enlarge snet density to further enhance routability + tmp_map = init_density_map + (snet_density_map.contiguous() > 0.1).float() + diff_area = (tmp_map.clamp_(min=0.0, max=1.0).sum() * data.bin_area - fixed_node_area).item() + if diff_area < total_filler_area * 0.5: + logger.info("Low utilization detect, enlarge snet density.") + init_density_map += (snet_density_map.contiguous() > 0.1).float() + init_density_map.clamp_(min=0.0, max=1.0) + init_density_map.clamp_(min=0.0, max=1.0).mul_(args.target_density) if args.use_route_force or args.use_cell_inflate: # inflate connected IOPins diff --git a/src/param_scheduler.py b/src/param_scheduler.py index af2874e..aac3c08 100644 --- a/src/param_scheduler.py +++ b/src/param_scheduler.py @@ -114,6 +114,8 @@ def __init__(self, data: PlaceData, args, logger) -> None: self.stop_overflow = args.stop_overflow self.skip_update = False if 
args.enable_skip_update else None self.enable_fence = data.enable_fence + self.min_enlarge_density_interval = 1000 + self.last_enlarge_density_iter = -self.min_enlarge_density_interval # skip density force self.enable_sample_force = True self.force_ratio = 0.0 @@ -202,7 +204,7 @@ def step(self, hpwl, overflow, node_pos, data): # self.skip_update = np.random.random() > (np.random.randn() * 0.08 + 0.4) # if self.density_weight > 0.1 or self.recorder.overflow[-1] < 0.2: # 2021 11 13 best # self.skip_update = np.random.random() > 0.4 - if self.weighted_weight > 0.5 and self.weighted_weight < 0.99: + if self.weighted_weight > 0.5 and self.weighted_weight < 0.95: self.skip_update = ((self.iter - self.init_iter) % 3 != 0) elif self.iter - self.init_iter < 50: # slow down the param update of early stage @@ -230,6 +232,19 @@ def step_density_weight(self): self.mu = 1.05 * np.clip(np.power(1.05, -delta_hpwl / 350000), 0.95, 1.05) self.density_weight *= self.mu + if ( + not self.enable_fence and + self.iter > 15 and + self.iter - self.last_enlarge_density_iter > self.min_enlarge_density_interval and + self.check_plateau(self.recorder.overflow, window=25, threshold=0.001) + ): + if self.recorder.overflow[-1] > 0.9: + self.last_enlarge_density_iter = self.iter + self.density_weight *= 2 + self.__logger__.warning( + "Detect plateau at early stage, enlarge density_weight. 
Iter: %d" % + self.iter) + def param_smooth_func(self, input, r=0.2, half_iter=30, end_iter=400): logistic = lambda x,k,x_0: 1 / (1 + math.exp(-k * (x - x_0))) lhs = 1 - logistic(input, r, end_iter - half_iter) diff --git a/src/run_placement_nesterov.py b/src/run_placement_nesterov.py index 60ca586..fb5fd70 100644 --- a/src/run_placement_nesterov.py +++ b/src/run_placement_nesterov.py @@ -137,6 +137,7 @@ def calc_route_force(mov_node_pos, mov_node_size, expand_ratio, constraint_fn): # p.step() # exit(0) terminate_signal = False + route_early_terminate_signal = False for iteration in range(args.inner_iter): # optimizer.zero_grad() # zero grad inside obj_and_grad_fn obj = optimizer.step(obj_and_grad_fn) @@ -189,7 +190,9 @@ def calc_route_force(mov_node_pos, mov_node_size, expand_ratio, constraint_fn): constraint_fn=trunc_node_pos_fn, ) # ps.use_cell_inflate is updated in route_inflation if not ps.use_cell_inflate: - logger.info("Stop cell inflation...") + route_early_terminate_signal = True + terminate_signal = True + logger.info("Early stop cell inflation...") if output is not None: gr_metrics, new_mov_node_size, new_expand_ratio = output ps.push_gr_sol(gr_metrics, hpwl, overflow, mov_node_pos) @@ -240,7 +243,7 @@ def calc_route_force(mov_node_pos, mov_node_size, expand_ratio, constraint_fn): ) ps.reset_best_sol() - if iteration % args.log_freq == 0 or iteration == args.inner_iter - 1 or ps.rerun_route: + if iteration % args.log_freq == 0 or iteration == args.inner_iter - 1 or ps.rerun_route or terminate_signal: log_str = ( "iter: %d | masked_hpwl: %.2E overflow: %.4f obj: %.4E " "density_weight: %.4E wa_coeff: %.4E" @@ -287,13 +290,14 @@ def calc_route_force(mov_node_pos, mov_node_size, expand_ratio, constraint_fn): mov_node_pos[mov_lhs:mov_rhs].data.copy_(best_sol[mov_lhs:mov_rhs]) if ps.enable_route: route_inflation_roll_back(args, logger, data, mov_node_size) - ps.rerun_route = True - gr_metrics = run_gr_and_fft_main( - args, logger, data, rawdb, gpdb, ps, 
mov_node_pos, constraint_fn=trunc_node_pos_fn, - skip_m1_route=True, report_gr_metrics_only=True - ) - ps.rerun_route = False - ps.push_gr_sol(gr_metrics, hpwl, overflow, mov_node_pos) + if not route_early_terminate_signal: + ps.rerun_route = True + gr_metrics = run_gr_and_fft_main( + args, logger, data, rawdb, gpdb, ps, mov_node_pos, constraint_fn=trunc_node_pos_fn, + skip_m1_route=True, report_gr_metrics_only=True + ) + ps.rerun_route = False + ps.push_gr_sol(gr_metrics, hpwl, overflow, mov_node_pos) best_sol_gr = ps.get_best_gr_sol() mov_node_pos[mov_lhs:mov_rhs].data.copy_(best_sol_gr[mov_lhs:mov_rhs]) diff --git a/tool/innovus_ispd2015_fix/parse_log.py b/tool/innovus_ispd2015_fix/parse_log.py index c1a6939..32fdb62 100644 --- a/tool/innovus_ispd2015_fix/parse_log.py +++ b/tool/innovus_ispd2015_fix/parse_log.py @@ -41,8 +41,8 @@ if "End route_design" in line: DrCpuTimeStr = line.split("total cpu=")[-1].split(", ")[0] DrTotalTimeStr = line.split("real=")[-1].split(", ")[0] - DrCpuTime = sum(x * int(t) for x, t in zip([3600, 60, 1], DrCpuTimeStr.split(":"))) # seconds - DrTotalTime = sum(x * int(t) for x, t in zip([3600, 60, 1], DrTotalTimeStr.split(":"))) # seconds + DrCpuTime = sum(x * int(float(t)) for x, t in zip([3600, 60, 1], DrCpuTimeStr.split(":"))) # seconds + DrTotalTime = sum(x * int(float(t)) for x, t in zip([3600, 60, 1], DrTotalTimeStr.split(":"))) # seconds if line.startswith("Wire Length Statistics :"): findTotalWL = True diff --git a/utils/get_design_params.py b/utils/get_design_params.py index 18a86d6..28a7095 100644 --- a/utils/get_design_params.py +++ b/utils/get_design_params.py @@ -6,9 +6,11 @@ def find_benchmark(dataset_root, benchmark): "ispd2005": os.path.join(dataset_root, "ispd2005"), "dac2012": os.path.join(dataset_root, "iccad2012dac2012"), "ispd2015": os.path.join(dataset_root, "ispd2015"), - "ispd2015_without_fence": os.path.join(dataset_root, "ispd2015_without_fence"), "ispd2015_fix": os.path.join(dataset_root, "ispd2015_fix"), + 
"ispd2018": os.path.join(dataset_root, "ispd2018"), + "ispd2019_no_fence": os.path.join(dataset_root, "ispd2019_no_fence"), "iccad2019": os.path.join(dataset_root, "iccad2019"), + "ispd2018": os.path.join(dataset_root, "ispd2018"), } root = bm_to_root[benchmark] all_designs = [i for i in os.listdir(root) if os.path.isdir(os.path.join(root, i))] @@ -22,12 +24,16 @@ def get_single_design_params(dataset_root, benchmark, design_name, placement=Non return single_dac2012(dataset_root, design_name, placement) elif benchmark == "ispd2015": return single_ispd2015(dataset_root, design_name, placement) - elif benchmark == "ispd2015_without_fence": - return single_ispd2015_without_fence(dataset_root, design_name, placement) elif benchmark == "ispd2015_fix": return single_ispd2015_fix(dataset_root, design_name, placement) + elif benchmark == "ispd2018": + return single_ispd2018(dataset_root, design_name, placement) + elif benchmark == "ispd2019_no_fence": + return single_ispd2019_no_fence(dataset_root, design_name, placement) elif benchmark == "iccad2019": return single_iccad2019(dataset_root, design_name, placement) + elif benchmark == "ispd2018": + return single_ispd2018(dataset_root, design_name, placement) else: raise NotImplementedError("benchmark %s is not found" % benchmark) @@ -87,32 +93,46 @@ def single_ispd2015(dataset_root, design_name, placement=None): return params -def single_ispd2015_without_fence(dataset_root, design_name, placement=None): +def single_ispd2015_fix(dataset_root, design_name, placement=None): # configuration - benchmark = "ispd2015_without_fence" + benchmark = "ispd2015_fix" root, all_designs = find_benchmark(dataset_root, benchmark) if design_name not in all_designs: raise ValueError("Design Name %s should in %s" % (design_name, all_designs)) params = { "benchmark": benchmark, - "tech_lef": "%s/%s/tech.lef" % (root, design_name), - "cell_lef": "%s/%s/cells.lef" % (root, design_name), - "def": "%s/%s/floorplan.def" % (root, design_name) if 
placement is None else placement, + "lef": "%s/%s/%s.lef" % (root, design_name, design_name), + "def": "%s/%s/%s.def" % (root, design_name, design_name) if placement is None else placement, "design_name": design_name, } return params -def single_ispd2015_fix(dataset_root, design_name, placement=None): +def single_ispd2018(dataset_root, design_name, placement=None): # configuration - benchmark = "ispd2015_fix" + benchmark = "ispd2018" root, all_designs = find_benchmark(dataset_root, benchmark) if design_name not in all_designs: raise ValueError("Design Name %s should in %s" % (design_name, all_designs)) params = { "benchmark": benchmark, - "lef": "%s/%s/%s.lef" % (root, design_name, design_name), - "def": "%s/%s/%s.def" % (root, design_name, design_name) if placement is None else placement, + "lef": "%s/%s/%s.input.lef" % (root, design_name, design_name), + "def": "%s/%s/%s.input.def" % (root, design_name, design_name) if placement is None else placement, + "design_name": design_name, + } + return params + + +def single_ispd2019_no_fence(dataset_root, design_name, placement=None): + # configuration + benchmark = "ispd2019_no_fence" + root, all_designs = find_benchmark(dataset_root, benchmark) + if design_name not in all_designs: + raise ValueError("Design Name %s should in %s" % (design_name, all_designs)) + params = { + "benchmark": benchmark, + "lef": "%s/%s/%s.input.lef" % (root, design_name, design_name), + "def": "%s/%s/%s.input.def" % (root, design_name, design_name) if placement is None else placement, "design_name": design_name, } return params @@ -133,15 +153,35 @@ def single_iccad2019(dataset_root, design_name, placement=None): return params +def single_ispd2018(dataset_root, design_name, placement=None): + # configuration + benchmark = "ispd2018" + root, all_designs = find_benchmark(dataset_root, benchmark) + if design_name not in all_designs: + raise ValueError("Design Name %s should in %s" % (design_name, all_designs)) + params = { + "benchmark": 
benchmark, + "lef": "%s/%s/%s.input.lef" % (root, design_name, design_name), + "def": "%s/%s/%s.input.def" % (root, design_name, design_name) + if placement is None + else placement, + "design_name": design_name, + } + return params + + def get_custom_design_params(args): - params = dict([ - [item.strip() for item in token.strip().split(":")] - for token in args.custom_path.split(",") if len(token) > 0 - ]) + params = dict( + [ + [item.strip() for item in token.strip().split(":")] + for token in args.custom_path.split(",") + if len(token) > 0 + ] + ) if "benchmark" not in params.keys(): raise ValueError("Cannot find 'benchmark' in args.custom_path") if "design_name" not in params.keys(): raise ValueError("Cannot find 'design_name' in args.custom_path") args.dataset = params["benchmark"] args.design_name = params["design_name"] - return params \ No newline at end of file + return params diff --git a/utils/io_parser.py b/utils/io_parser.py index 15a82a8..5740bd9 100644 --- a/utils/io_parser.py +++ b/utils/io_parser.py @@ -14,11 +14,12 @@ def load_params( self, params: dict, verbose_log: bool = False, + log_level: int = 2, lite_mode: bool = False, random_place: bool = True, num_threads: int = 8, ): - check_status = self.check_params(params, verbose_log, lite_mode, random_place, num_threads) + check_status = self.check_params(params, verbose_log, log_level, lite_mode, random_place, num_threads) if not check_status: raise ValueError( "Checking failure. 
Please check the validity of params: %s" % params @@ -31,7 +32,7 @@ def load_params( return load_status def check_params( - self, params: dict, verbose_log: bool, lite_mode: bool, random_place: bool, num_threads: int = 8 + self, params: dict, verbose_log: bool, log_level: int, lite_mode: bool, random_place: bool, num_threads: int = 8 ) -> bool: if "def" not in params.keys() and "aux" not in params.keys(): print("def or aux is not found!") @@ -82,6 +83,14 @@ def check_params( else: self.params["verbose_parser_log"] = False + # === LOG_LEVEL === + # 0: DEBUG, 1: VERBOSE, 2: INFO, 3: NOTICE, 4: WARN, 5: ERROR, 6: FATAL, 7: OK + if log_level < 0: + log_level = 0 + elif log_level > 7: + log_level = 2 + self.params["global_log_level"] = log_level + if lite_mode: self.params["lite_mode"] = True else: @@ -100,12 +109,13 @@ def read( self, params: dict, verbose_log: bool = False, + log_level: int = 2, lite_mode: bool = False, random_place: bool = True, num_threads: int = 8, debug: bool = False, ): - check_status = self.check_params(params, verbose_log, lite_mode, random_place, num_threads) + check_status = self.check_params(params, verbose_log, log_level, lite_mode, random_place, num_threads) if not check_status: raise ValueError( "Checking failure. Please check the validity of params: %s" % params